From 722f8c51d8af223751dfb1d02de40043e8ba067e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 9 Dec 2019 11:46:13 -0500 Subject: [PATCH 01/32] virtio: add ability to delete vq through a pointer Devices tend to maintain vq pointers, allow deleting them through a vq pointer. Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand Reviewed-by: David Hildenbrand --- hw/virtio/virtio.c | 15 ++++++++++----- include/hw/virtio/virtio.h | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 04716b5f6c..31dd140990 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2330,17 +2330,22 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, return &vdev->vq[i]; } +void virtio_delete_queue(VirtQueue *vq) +{ + vq->vring.num = 0; + vq->vring.num_default = 0; + vq->handle_output = NULL; + vq->handle_aio_output = NULL; + g_free(vq->used_elems); +} + void virtio_del_queue(VirtIODevice *vdev, int n) { if (n < 0 || n >= VIRTIO_QUEUE_MAX) { abort(); } - vdev->vq[n].vring.num = 0; - vdev->vq[n].vring.num_default = 0; - vdev->vq[n].handle_output = NULL; - vdev->vq[n].handle_aio_output = NULL; - g_free(vdev->vq[n].used_elems); + virtio_delete_queue(&vdev->vq[n]); } static void virtio_set_isr(VirtIODevice *vdev, int value) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c32a815303..e18756d50d 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -183,6 +183,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, void virtio_del_queue(VirtIODevice *vdev, int n); +void virtio_delete_queue(VirtQueue *vq); + void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len); void virtqueue_flush(VirtQueue *vq, unsigned int count); From 8cd353ea0fbf0e334e015d833f612799be642296 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Mon, 9 Dec 2019 11:47:24 -0500 Subject: [PATCH 02/32] virtio: make virtio_delete_queue idempotent Let's make sure calling this twice is harmless - no known instances, but seems safer. Suggested-by: Pan Nengyuan Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 31dd140990..6de3cfdc2c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2337,6 +2337,7 @@ void virtio_delete_queue(VirtQueue *vq) vq->handle_output = NULL; vq->handle_aio_output = NULL; g_free(vq->used_elems); + vq->used_elems = NULL; } void virtio_del_queue(VirtIODevice *vdev, int n) From 3627842855a23f54b62b48560c9724c2bb3c4af3 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Wed, 4 Dec 2019 15:31:55 +0800 Subject: [PATCH 03/32] virtio-balloon: fix memory leak while attach virtio-balloon device ivq/dvq/svq/free_page_vq are never cleaned up in virtio_balloon_device_unrealize; the memory leak stack is as follows: Direct leak of 14336 byte(s) in 2 object(s) allocated from: #0 0x7f99fd9d8560 in calloc (/usr/lib64/libasan.so.3+0xc7560) #1 0x7f99fcb20015 in g_malloc0 (/usr/lib64/libglib-2.0.so.0+0x50015) #2 0x557d90638437 in virtio_add_queue hw/virtio/virtio.c:2327 #3 0x557d9064401d in virtio_balloon_device_realize hw/virtio/virtio-balloon.c:793 #4 0x557d906356f7 in virtio_device_realize hw/virtio/virtio.c:3504 #5 0x557d9073f081 in device_set_realized hw/core/qdev.c:876 #6 0x557d908b1f4d in property_set_bool qom/object.c:2080 #7 0x557d908b655e in object_property_set_qobject qom/qom-qobject.c:26 Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Message-Id: <1575444716-17632-2-git-send-email-pannengyuan@huawei.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand Reviewed-by: Michael S. 
Tsirkin Reviewed-by: David Hildenbrand --- hw/virtio/virtio-balloon.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 40b04f5180..57f3b9f22d 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -831,6 +831,13 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp) } balloon_stats_destroy_timer(s); qemu_remove_balloon_handler(s); + + virtio_delete_queue(s->ivq); + virtio_delete_queue(s->dvq); + virtio_delete_queue(s->svq); + if (s->free_page_vq) { + virtio_delete_queue(s->free_page_vq); + } virtio_cleanup(vdev); } From e615c157f3d27ab42affba9046ae426a83f29874 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Wed, 4 Dec 2019 15:31:56 +0800 Subject: [PATCH 04/32] virtio-serial-bus: fix memory leak while attach virtio-serial-bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ivqs/ovqs/c_ivq/c_ovq are never cleaned up in virtio_serial_device_unrealize; the memory leak stack is as follows: Direct leak of 1290240 byte(s) in 180 object(s) allocated from: #0 0x7fc9bfc27560 in calloc (/usr/lib64/libasan.so.3+0xc7560) #1 0x7fc9bed6f015 in g_malloc0 (/usr/lib64/libglib-2.0.so.0+0x50015) #2 0x5650e02b83e7 in virtio_add_queue hw/virtio/virtio.c:2327 #3 0x5650e02847b5 in virtio_serial_device_realize hw/char/virtio-serial-bus.c:1089 #4 0x5650e02b56a7 in virtio_device_realize hw/virtio/virtio.c:3504 #5 0x5650e03bf031 in device_set_realized hw/core/qdev.c:876 #6 0x5650e0531efd in property_set_bool qom/object.c:2080 #7 0x5650e053650e in object_property_set_qobject qom/qom-qobject.c:26 #8 0x5650e0533e14 in object_property_set_bool qom/object.c:1338 #9 0x5650e04c0e37 in virtio_pci_realize hw/virtio/virtio-pci.c:1801 Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Cc: Laurent Vivier Cc: Amit Shah Cc: "Marc-André Lureau" Cc: Paolo Bonzini Message-Id: <1575444716-17632-3-git-send-email-pannengyuan@huawei.com> Signed-off-by: 
Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin --- hw/char/virtio-serial-bus.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 33259042a9..e1cbce3ba3 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -1126,9 +1126,17 @@ static void virtio_serial_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOSerial *vser = VIRTIO_SERIAL(dev); + int i; QLIST_REMOVE(vser, next); + virtio_delete_queue(vser->c_ivq); + virtio_delete_queue(vser->c_ovq); + for (i = 0; i < vser->bus.max_nr_ports; i++) { + virtio_delete_queue(vser->ivqs[i]); + virtio_delete_queue(vser->ovqs[i]); + } + g_free(vser->ivqs); g_free(vser->ovqs); g_free(vser->ports_map); From 087ed8a5827d10cd346ba5fa89845f14b9dff377 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 9 Dec 2019 11:55:10 -0500 Subject: [PATCH 05/32] virtio-input: convert to new virtio_delete_queue Seems cleaner than using VQ index values. Signed-off-by: Michael S. 
Tsirkin --- hw/input/virtio-input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c index ec54e46ad6..9c013afddb 100644 --- a/hw/input/virtio-input.c +++ b/hw/input/virtio-input.c @@ -280,6 +280,7 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp) { VirtIOInputClass *vic = VIRTIO_INPUT_GET_CLASS(dev); VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOInput *vinput = VIRTIO_INPUT(dev); Error *local_err = NULL; if (vic->unrealize) { @@ -289,8 +290,8 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp) return; } } - virtio_del_queue(vdev, 0); - virtio_del_queue(vdev, 1); + virtio_delete_queue(vinput->evt); + virtio_delete_queue(vinput->sts); virtio_cleanup(vdev); } From 8fdee7118defeb34441d860047e3e0cfd9da9c26 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Thu, 5 Dec 2019 17:54:39 +0800 Subject: [PATCH 06/32] intel_iommu: fix bug to read DMAR_RTADDR_REG Should directly read DMAR_RTADDR_REG but not using 's->root'. Because 's->root' is modified in 'vtd_root_table_setup()' so that the first 12 bits are omitted. This causes the guest iommu debugfs cannot show pasid tables. Signed-off-by: Yi Sun Message-Id: <20191205095439.29114-1-yi.y.sun@linux.intel.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Peter Xu Reviewed-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 43c94b993b..ee06993675 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2610,16 +2610,15 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) switch (addr) { /* Root Table Address Register, 64-bit */ case DMAR_RTADDR_REG: + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); if (size == 4) { - val = s->root & ((1ULL << 32) - 1); - } else { - val = s->root; + val = val & ((1ULL << 32) - 1); } break; case DMAR_RTADDR_REG_HI: assert(size == 4); - val = s->root >> 32; + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; break; /* Invalidation Queue Address Register, 64-bit */ From d0c5f643383b9e84316f148affff368ac33d75b9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 13 Dec 2019 09:22:48 -0500 Subject: [PATCH 07/32] virtio: update queue size on guest write Some guests read back queue size after writing it. Update the size immediately upon write otherwise they get confused. In particular this is the case for seabios. Reported-by: Roman Kagan Suggested-by: Denis Plotnikov Cc: qemu-stable@nongnu.org Signed-off-by: Michael S. 
Tsirkin --- hw/virtio/virtio-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index c6b47a9c73..e5c759e19e 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1256,6 +1256,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, break; case VIRTIO_PCI_COMMON_Q_SIZE: proxy->vqs[vdev->queue_sel].num = val; + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); break; case VIRTIO_PCI_COMMON_Q_MSIX: msix_vector_unuse(&proxy->pci_dev, From 9d7bd0826f2d19f88631ad7078662668148f7b5f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 19 Nov 2019 18:50:03 -0600 Subject: [PATCH 08/32] virtio-pci: disable vring processing when bus-mastering is disabled Currently the SLOF firmware for pseries guests will disable/re-enable a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND register after the initial probe/feature negotiation, as it tends to work with a single device at a time at various stages like probing and running block/network bootloaders without doing a full reset in-between. In QEMU, when PCI_COMMAND_MASTER is disabled we disable the corresponding IOMMU memory region, so DMA accesses (including to vring fields like idx/flags) will no longer undergo the necessary translation. Normally we wouldn't expect this to happen since it would be misbehavior on the driver side to continue driving DMA requests. 
However, in the case of pseries, with iommu_platform=on, we trigger the following sequence when tearing down the virtio-blk dataplane ioeventfd in response to the guest unsetting PCI_COMMAND_MASTER: #2 0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0) at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757 #3 0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184) at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950 #4 0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0) at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255 #5 0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660) at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776 #6 0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660) at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550 #7 0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660) at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546 #8 0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8) at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527 #9 0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8) at /home/mdroth/w/qemu.git/util/aio-posix.c:520 #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0) at /home/mdroth/w/qemu.git/util/aio-posix.c:607 #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true) at /home/mdroth/w/qemu.git/util/aio-posix.c:639 #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 , opaque=opaque@entry=0x555556de86f0) at /home/mdroth/w/qemu.git/util/aio-wait.c:71 #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=) at 
/home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288 #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38) at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245 #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38) at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237 #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40) at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292 #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=, val=1048832, len=) at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613 I.e. the calling code is only scheduling a one-shot BH for virtio_blk_data_plane_stop_bh, but somehow we end up trying to process an additional virtqueue entry before we get there. This is likely due to the following check in virtio_queue_host_notifier_aio_poll: static bool virtio_queue_host_notifier_aio_poll(void *opaque) { EventNotifier *n = opaque; VirtQueue *vq = container_of(n, VirtQueue, host_notifier); bool progress; if (!vq->vring.desc || virtio_queue_empty(vq)) { return false; } progress = virtio_queue_notify_aio_vq(vq); namely the call to virtio_queue_empty(). In this case, since no new requests have actually been issued, shadow_avail_idx == last_avail_idx, so we actually try to access the vring via vring_avail_idx() to get the latest non-shadowed idx: int virtio_queue_empty(VirtQueue *vq) { bool empty; ... if (vq->shadow_avail_idx != vq->last_avail_idx) { return 0; } rcu_read_lock(); empty = vring_avail_idx(vq) == vq->last_avail_idx; rcu_read_unlock(); return empty; but since the IOMMU region has been disabled we get a bogus value (0 usually), which causes virtio_queue_empty() to falsely report that there are entries to be processed, which causes errors such as: "virtio: zero sized buffers are not allowed" or "virtio-blk missing headers" and puts the device in an error state. 
This patch works around the issue by introducing virtio_set_disabled(), which sets a 'disabled' flag to bypass checks like virtio_queue_empty() when bus-mastering is disabled. Since we'd check this flag at all the same sites as vdev->broken, we replace those checks with an inline function which checks for either vdev->broken or vdev->disabled. The 'disabled' flag is only migrated when set, which should be fairly rare, but to maintain migration compatibility we disable its use for older machine types. Users requiring the use of the flag in conjunction with older machine types can set it explicitly as a virtio-device option. NOTES: - This leaves some other oddities in play, like the fact that DRIVER_OK also gets unset in response to bus-mastering being disabled, but not restored (however the device seems to continue working) - Similarly, we disable the host notifier via virtio_bus_stop_ioeventfd(), which seems to move the handling out of virtio-blk dataplane and back into the main IO thread, and it ends up staying there till a reset (but otherwise continues working normally) Cc: David Gibson , Cc: Alexey Kardashevskiy Cc: "Michael S. Tsirkin" Signed-off-by: Michael Roth Message-Id: <20191120005003.27035-1-mdroth@linux.vnet.ibm.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/core/machine.c | 1 + hw/virtio/virtio-pci.c | 12 ++++++++---- hw/virtio/virtio.c | 35 ++++++++++++++++++++++++++++------- include/hw/virtio/virtio.h | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 11 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 56137e9bf0..0854dcebdd 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -34,6 +34,7 @@ const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2); GlobalProperty hw_compat_4_1[] = { { "virtio-pci", "x-pcie-flr-init", "off" }, + { "virtio-device", "use-disabled-flag", "false" }, }; const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index e5c759e19e..f723b9f631 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, pcie_cap_flr_write_config(pci_dev, address, val, len); } - if (range_covers_byte(address, len, PCI_COMMAND) && - !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { - virtio_pci_stop_ioeventfd(proxy); - virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); + if (range_covers_byte(address, len, PCI_COMMAND)) { + if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { + virtio_set_disabled(vdev, true); + virtio_pci_stop_ioeventfd(proxy); + virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); + } else { + virtio_set_disabled(vdev, false); + } } if (proxy->config_cap && diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 6de3cfdc2c..7bc6a9455e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -546,7 +546,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter) * Called within rcu_read_lock(). 
*/ static int virtio_queue_empty_rcu(VirtQueue *vq) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 1; } @@ -565,7 +565,7 @@ static int virtio_queue_split_empty(VirtQueue *vq) { bool empty; - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 1; } @@ -783,7 +783,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, virtqueue_unmap_sg(vq, elem, len); - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return; } @@ -839,7 +839,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) void virtqueue_flush(VirtQueue *vq, unsigned int count) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { vq->inuse -= count; return; } @@ -1602,7 +1602,7 @@ err_undo_map: void *virtqueue_pop(VirtQueue *vq, size_t sz) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return NULL; } @@ -1698,7 +1698,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq) { struct VirtIODevice *vdev = vq->vdev; - if (unlikely(vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 0; } @@ -1816,7 +1816,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector) BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - if (unlikely(vdev->broken)) { + if (virtio_device_disabled(vdev)) { return; } @@ -1920,6 +1920,7 @@ void virtio_reset(void *opaque) vdev->guest_features = 0; vdev->queue_sel = 0; vdev->status = 0; + vdev->disabled = false; atomic_set(&vdev->isr, 0); vdev->config_vector = VIRTIO_NO_VECTOR; virtio_notify_vector(vdev, vdev->config_vector); @@ -2559,6 +2560,13 @@ static bool virtio_started_needed(void *opaque) return vdev->started; } +static bool virtio_disabled_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->disabled; +} + static const VMStateDescription vmstate_virtqueue = { .name = "virtqueue_state", .version_id = 1, @@ 
-2724,6 +2732,17 @@ static const VMStateDescription vmstate_virtio_started = { } }; +static const VMStateDescription vmstate_virtio_disabled = { + .name = "virtio/disabled", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_disabled_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(disabled, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio = { .name = "virtio", .version_id = 1, @@ -2741,6 +2760,7 @@ static const VMStateDescription vmstate_virtio = { &vmstate_virtio_extra_state, &vmstate_virtio_started, &vmstate_virtio_packed_virtqueues, + &vmstate_virtio_disabled, NULL } }; @@ -3575,6 +3595,7 @@ static void virtio_device_instance_finalize(Object *obj) static Property virtio_properties[] = { DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features), DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true), + DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index e18756d50d..777772475c 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -100,6 +100,8 @@ struct VirtIODevice uint16_t device_id; bool vm_running; bool broken; /* device in invalid state, needs reset */ + bool use_disabled_flag; /* allow use of 'disable' flag when needed */ + bool disabled; /* device in temporarily disabled state */ bool use_started; bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ @@ -380,4 +382,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started) vdev->started = started; } } + +static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable) +{ + if (vdev->use_disabled_flag) { + vdev->disabled = disable; + } +} + +static inline bool virtio_device_disabled(VirtIODevice *vdev) +{ + return unlikely(vdev->disabled || vdev->broken); +} + #endif From 6620801f39f0cd02b30ab5ad8729c92c78ce9fd3 Mon Sep 
17 00:00:00 2001 From: Micky Yun Chan Date: Mon, 9 Dec 2019 09:53:31 +0800 Subject: [PATCH 09/32] Implement backend program convention command for vhost-user-blk This patch is to add standard commands defined in docs/interop/vhost-user.rst For vhost-user-* program Signed-off-by: Micky Yun Chan (michiboo) Message-Id: <20191209015331.5455-1-chanmickyyun@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/vhost-user-blk/vhost-user-blk.c | 108 ++++++++++++++---------- docs/interop/vhost-user.json | 31 +++++++ docs/interop/vhost-user.rst | 17 ++++ 3 files changed, 112 insertions(+), 44 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index ae61034656..6fd91c7e99 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -576,70 +576,90 @@ vub_new(char *blk_file) return vdev_blk; } +static int opt_fdnum = -1; +static char *opt_socket_path; +static char *opt_blk_file; +static gboolean opt_print_caps; +static gboolean opt_read_only; + +static GOptionEntry entries[] = { + { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, + "Print capabilities", NULL }, + { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, + "Use inherited fd socket", "FDNUM" }, + { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, + "Use UNIX socket path", "PATH" }, + {"blk-file", 'b', 0, G_OPTION_ARG_FILENAME, &opt_blk_file, + "block device or file path", "PATH"}, + { "read-only", 'r', 0, G_OPTION_ARG_NONE, &opt_read_only, + "Enable read-only", NULL } +}; + int main(int argc, char **argv) { - int opt; - char *unix_socket = NULL; - char *blk_file = NULL; - bool enable_ro = false; int lsock = -1, csock = -1; VubDev *vdev_blk = NULL; + GError *error = NULL; + GOptionContext *context; - while ((opt = getopt(argc, argv, "b:rs:h")) != -1) { - switch (opt) { - case 'b': - blk_file = g_strdup(optarg); - break; - case 's': - unix_socket = 
g_strdup(optarg); - break; - case 'r': - enable_ro = true; - break; - case 'h': - default: - printf("Usage: %s [ -b block device or file, -s UNIX domain socket" - " | -r Enable read-only ] | [ -h ]\n", argv[0]); - return 0; + context = g_option_context_new(NULL); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_printerr("Option parsing failed: %s\n", error->message); + exit(EXIT_FAILURE); + } + if (opt_print_caps) { + g_print("{\n"); + g_print(" \"type\": \"block\",\n"); + g_print(" \"features\": [\n"); + g_print(" \"read-only\",\n"); + g_print(" \"blk-file\"\n"); + g_print(" ]\n"); + g_print("}\n"); + exit(EXIT_SUCCESS); + } + + if (!opt_blk_file) { + g_print("%s\n", g_option_context_get_help(context, true, NULL)); + exit(EXIT_FAILURE); + } + + if (opt_socket_path) { + lsock = unix_sock_new(opt_socket_path); + if (lsock < 0) { + exit(EXIT_FAILURE); } + } else if (opt_fdnum < 0) { + g_print("%s\n", g_option_context_get_help(context, true, NULL)); + exit(EXIT_FAILURE); + } else { + lsock = opt_fdnum; } - if (!unix_socket || !blk_file) { - printf("Usage: %s [ -b block device or file, -s UNIX domain socket" - " | -r Enable read-only ] | [ -h ]\n", argv[0]); - return -1; - } - - lsock = unix_sock_new(unix_socket); - if (lsock < 0) { - goto err; - } - - csock = accept(lsock, (void *)0, (void *)0); + csock = accept(lsock, NULL, NULL); if (csock < 0) { - fprintf(stderr, "Accept error %s\n", strerror(errno)); - goto err; + g_printerr("Accept error %s\n", strerror(errno)); + exit(EXIT_FAILURE); } - vdev_blk = vub_new(blk_file); + vdev_blk = vub_new(opt_blk_file); if (!vdev_blk) { - goto err; + exit(EXIT_FAILURE); } - if (enable_ro) { + if (opt_read_only) { vdev_blk->enable_ro = true; } if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock, vub_panic_cb, &vub_iface)) { - fprintf(stderr, "Failed to initialized libvhost-user-glib\n"); - goto err; + g_printerr("Failed to initialize 
libvhost-user-glib\n"); + exit(EXIT_FAILURE); } g_main_loop_run(vdev_blk->loop); - + g_main_loop_unref(vdev_blk->loop); + g_option_context_free(context); vug_deinit(&vdev_blk->parent); - -err: vub_free(vdev_blk); if (csock >= 0) { close(csock); @@ -647,8 +667,8 @@ err: if (lsock >= 0) { close(lsock); } - g_free(unix_socket); - g_free(blk_file); + g_free(opt_socket_path); + g_free(opt_blk_file); return 0; } diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json index da6aaf51c8..ce0ef74db5 100644 --- a/docs/interop/vhost-user.json +++ b/docs/interop/vhost-user.json @@ -54,6 +54,37 @@ ] } +## +# @VHostUserBackendBlockFeature: +# +# List of vhost user "block" features. +# +# @read-only: The --read-only command line option is supported. +# @blk-file: The --blk-file command line option is supported. +# +# Since: 5.0 +## +{ + 'enum': 'VHostUserBackendBlockFeature', + 'data': [ 'read-only', 'blk-file' ] +} + +## +# @VHostUserBackendCapabilitiesBlock: +# +# Capabilities reported by vhost user "block" backends +# +# @features: list of supported features. +# +# Since: 5.0 +## +{ + 'struct': 'VHostUserBackendCapabilitiesBlock', + 'data': { + 'features': [ 'VHostUserBackendBlockFeature' ] + } +} + ## # @VHostUserBackendInputFeature: # diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 7827b710aa..015ac08177 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1376,3 +1376,20 @@ Command line options: Enable virgl rendering support. (optional) + +vhost-user-blk +-------------- + +Command line options: + +--blk-file=PATH + + Specify block device or file path. + + (optional) + +--read-only + + Enable read-only. + + (optional) From d0435bc513e23a4961b6af20164d1c6c219eb4ea Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 9 Dec 2019 21:09:57 +0000 Subject: [PATCH 10/32] virtio: don't enable notifications during polling Virtqueue notifications are not necessary during polling, so we disable them. 
This allows the guest driver to avoid MMIO vmexits. Unfortunately the virtio-blk and virtio-scsi handler functions re-enable notifications, defeating this optimization. Fix virtio-blk and virtio-scsi emulation so they leave notifications disabled. The key thing to remember for correctness is that polling always checks one last time after ending its loop, therefore it's safe to lose the race when re-enabling notifications at the end of polling. There is a measurable performance improvement of 5-10% with the null-co block driver. Real-life storage configurations will see a smaller improvement because the MMIO vmexit overhead contributes less to latency. Signed-off-by: Stefan Hajnoczi Message-Id: <20191209210957.65087-1-stefanha@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/block/virtio-blk.c | 9 +++++++-- hw/scsi/virtio-scsi.c | 9 +++++++-- hw/virtio/virtio.c | 12 ++++++------ include/hw/virtio/virtio.h | 1 + 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index d62e6377c2..b12157b5eb 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -764,13 +764,16 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) { VirtIOBlockReq *req; MultiReqBuffer mrb = {}; + bool suppress_notifications = virtio_queue_get_notification(vq); bool progress = false; aio_context_acquire(blk_get_aio_context(s->blk)); blk_io_plug(s->blk); do { - virtio_queue_set_notification(vq, 0); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 0); + } while ((req = virtio_blk_get_request(s, vq))) { progress = true; @@ -781,7 +784,9 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) } } - virtio_queue_set_notification(vq, 1); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 1); + } } while (!virtio_queue_empty(vq)); if (mrb.num_reqs) { diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index e8b2b64d09..f080545f48 100644 --- 
a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -597,12 +597,15 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req, *next; int ret = 0; + bool suppress_notifications = virtio_queue_get_notification(vq); bool progress = false; QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); do { - virtio_queue_set_notification(vq, 0); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 0); + } while ((req = virtio_scsi_pop_req(s, vq))) { progress = true; @@ -622,7 +625,9 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) } } - virtio_queue_set_notification(vq, 1); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 1); + } } while (ret != -EINVAL && !virtio_queue_empty(vq)); QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 7bc6a9455e..95d8ff8508 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -432,6 +432,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) } } +bool virtio_queue_get_notification(VirtQueue *vq) +{ + return vq->notification; +} + void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; @@ -3410,17 +3415,12 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque) { EventNotifier *n = opaque; VirtQueue *vq = container_of(n, VirtQueue, host_notifier); - bool progress; if (!vq->vring.desc || virtio_queue_empty(vq)) { return false; } - progress = virtio_queue_notify_aio_vq(vq); - - /* In case the handler function re-enabled notifications */ - virtio_queue_set_notification(vq, 0); - return progress; + return virtio_queue_notify_aio_vq(vq); } static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 777772475c..b69d517496 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -228,6 +228,7 @@ int virtio_load(VirtIODevice *vdev, 
QEMUFile *f, int version_id); void virtio_notify_config(VirtIODevice *vdev); +bool virtio_queue_get_notification(VirtQueue *vq); void virtio_queue_set_notification(VirtQueue *vq, int enable); int virtio_queue_ready(VirtQueue *vq); From 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Fri, 13 Dec 2019 09:19:22 +0800 Subject: [PATCH 11/32] numa: Extend CLI to provide initiator information for numa nodes In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), The initiator represents processor which access to memory. And in 5.2.27.3 Memory Proximity Domain Attributes Structure, the attached initiator is defined as where the memory controller responsible for a memory proximity domain. With attached initiator information, the topology of heterogeneous memory can be described. Add new machine property 'hmat' to enable all HMAT specific options. Extend CLI of "-numa node" option to indicate the initiator numa node-id. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. Before using initiator option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Jingqi Liu Suggested-by: Dan Williams Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-2-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/core/machine.c | 64 +++++++++++++++++++++++++++++++++++++++++++ hw/core/numa.c | 23 ++++++++++++++++ include/sysemu/numa.h | 5 ++++ qapi/machine.json | 10 ++++++- qemu-options.hx | 35 +++++++++++++++++++---- 5 files changed, 131 insertions(+), 6 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 0854dcebdd..f5e2b32b3b 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -430,6 +430,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) ms->nvdimms_state->is_enabled = value; } +static bool machine_get_hmat(Object *obj, Error **errp) +{ + MachineState *ms = MACHINE(obj); + + return ms->numa_state->hmat_enabled; +} + +static void machine_set_hmat(Object *obj, bool value, Error **errp) +{ + MachineState *ms = MACHINE(obj); + + ms->numa_state->hmat_enabled = value; +} + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); @@ -557,6 +571,7 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp) { MachineClass *mc = MACHINE_GET_CLASS(machine); + NodeInfo *numa_info = machine->numa_state->nodes; bool match = false; int i; @@ -626,6 +641,17 @@ void machine_set_cpu_numa_node(MachineState *machine, match = true; slot->props.node_id = props->node_id; slot->props.has_node_id = props->has_node_id; + + if (machine->numa_state->hmat_enabled) { + if ((numa_info[props->node_id].initiator < MAX_NODES) && + (props->node_id != numa_info[props->node_id].initiator)) { + error_setg(errp, "The initiator of CPU NUMA node %" PRId64 + " should be itself", props->node_id); + return; + } + numa_info[props->node_id].has_cpu = true; + numa_info[props->node_id].initiator = props->node_id; + } } if (!match) { @@ -846,6 +872,13 @@ static void machine_initfn(Object *obj) if (mc->numa_mem_supported) { ms->numa_state = g_new0(NumaState, 1); + object_property_add_bool(obj, "hmat", + machine_get_hmat, machine_set_hmat, + &error_abort); + 
object_property_set_description(obj, "hmat", + "Set on/off to enable/disable " + "ACPI Heterogeneous Memory Attribute " + "Table (HMAT)", NULL); } /* Register notifier when init is done for sysbus sanity checks */ @@ -913,6 +946,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) return g_string_free(s, false); } +static void numa_validate_initiator(NumaState *numa_state) +{ + int i; + NodeInfo *numa_info = numa_state->nodes; + + for (i = 0; i < numa_state->num_nodes; i++) { + if (numa_info[i].initiator == MAX_NODES) { + error_report("The initiator of NUMA node %d is missing, use " + "'-numa node,initiator' option to declare it", i); + exit(1); + } + + if (!numa_info[numa_info[i].initiator].present) { + error_report("NUMA node %" PRIu16 " is missing, use " + "'-numa node' option to declare it first", + numa_info[i].initiator); + exit(1); + } + + if (!numa_info[numa_info[i].initiator].has_cpu) { + error_report("The initiator of NUMA node %d is invalid", i); + exit(1); + } + } +} + static void machine_numa_finish_cpu_init(MachineState *machine) { int i; @@ -953,6 +1012,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine) machine_set_cpu_numa_node(machine, &props, &error_fatal); } } + + if (machine->numa_state->hmat_enabled) { + numa_validate_initiator(machine->numa_state); + } + if (s->len && !qtest_enabled()) { warn_report("CPU(s) not present in any NUMA nodes: %s", s->str); diff --git a/hw/core/numa.c b/hw/core/numa.c index e3332a984f..e60da99293 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -133,6 +133,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); } + + /* + * If not set the initiator, set it to MAX_NODES. And if + * HMAT is enabled and this node has no cpus, QEMU will raise error. 
+ */ + numa_info[nodenr].initiator = MAX_NODES; + if (node->has_initiator) { + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + if (node->initiator >= MAX_NODES) { + error_report("The initiator id %" PRIu16 " expects an integer " + "between 0 and %d", node->initiator, + MAX_NODES - 1); + return; + } + + numa_info[nodenr].initiator = node->initiator; + } numa_info[nodenr].present = true; max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); ms->numa_state->num_nodes++; diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index ae9c41d02b..788cbec7a2 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -18,6 +18,8 @@ struct NodeInfo { uint64_t node_mem; struct HostMemoryBackend *node_memdev; bool present; + bool has_cpu; + uint16_t initiator; uint8_t distance[MAX_NODES]; }; @@ -33,6 +35,9 @@ struct NumaState { /* Allow setting NUMA distance for different NUMA nodes */ bool have_numa_distance; + /* Detect if HMAT support is enabled. */ + bool hmat_enabled; + /* NUMA nodes information */ NodeInfo nodes[MAX_NODES]; }; diff --git a/qapi/machine.json b/qapi/machine.json index ca26779f1a..27d0e37534 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -463,6 +463,13 @@ # @memdev: memory backend object. If specified for one node, # it must be specified for all nodes. # +# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, +# points to the nodeid which has the memory controller +# responsible for this NUMA node. 
This field provides +# additional information as to the initiator node that +# is closest (as in directly attached) to this node, and +# therefore has the best performance (since 5.0) +# # Since: 2.1 ## { 'struct': 'NumaNodeOptions', @@ -470,7 +477,8 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem': 'size', - '*memdev': 'str' }} + '*memdev': 'str', + '*initiator': 'uint16' }} ## # @NumaDistOptions: diff --git a/qemu-options.hx b/qemu-options.hx index e9d6231438..b78bc52634 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -40,7 +40,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ " suppress-vmdesc=on|off disables self-describing migration (default=off)\n" " nvdimm=on|off controls NVDIMM support (default=off)\n" " enforce-config-section=on|off enforce configuration section migration (default=off)\n" - " memory-encryption=@var{} memory encryption object to use (default=none)\n", + " memory-encryption=@var{} memory encryption object to use (default=none)\n" + " hmat=on|off controls ACPI HMAT support (default=off)\n", QEMU_ARCH_ALL) STEXI @item -machine [type=]@var{name}[,prop=@var{value}[,...]] @@ -94,6 +95,9 @@ NOTE: this parameter is deprecated. Please use @option{-global} @option{migration.send-configuration}=@var{on|off} instead. @item memory-encryption=@var{} Memory encryption object to use. The default is none. +@item hmat=on|off +Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support. +The default is off. 
@end table ETEXI @@ -168,14 +172,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, - "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" + "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa dist,src=source,dst=destination,val=distance\n" "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", QEMU_ARCH_ALL) STEXI -@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] -@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] +@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] +@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] @findex -numa @@ -222,6 +226,27 @@ split equally between them. @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, if one node uses @samp{memdev}, all of them have to use it. +@samp{initiator} is an additional option that points to an @var{initiator} +NUMA node that has best performance (the lowest latency or largest bandwidth) +to this NUMA @var{node}. Note that this option can be set only when +the machine property 'hmat' is set to 'on'. + +Following example creates a machine with 2 NUMA nodes, node 0 has CPU. +node 1 has only memory, and its initiator is node 0. Note that because +node 0 has CPU, by default the initiator of node 0 is itself and must be +itself. 
+@example +-machine hmat=on \ +-m 2G,slots=2,maxmem=4G \ +-object memory-backend-ram,size=1G,id=m0 \ +-object memory-backend-ram,size=1G,id=m1 \ +-numa node,nodeid=0,memdev=m0 \ +-numa node,nodeid=1,memdev=m1,initiator=0 \ +-smp 2,sockets=2,maxcpus=2 \ +-numa cpu,node-id=0,socket-id=0 \ +-numa cpu,node-id=0,socket-id=1 +@end example + @var{source} and @var{destination} are NUMA node IDs. @var{distance} is the NUMA distance from @var{source} to @var{destination}. The distance from a node to itself is always 10. If any pair of nodes is From 9b12dfa03a94d7f7a4b54eb67229a31e58193384 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Fri, 13 Dec 2019 09:19:23 +0800 Subject: [PATCH 12/32] numa: Extend CLI to provide memory latency and bandwidth information Add -numa hmat-lb option to provide System Locality Latency and Bandwidth Information. These memory attributes help to build System Locality Latency and Bandwidth Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-lb option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-3-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Igor Mammedov --- hw/core/numa.c | 194 ++++++++++++++++++++++++++++++++++++++++++ include/sysemu/numa.h | 53 ++++++++++++ qapi/machine.json | 93 +++++++++++++++++++- qemu-options.hx | 47 +++++++++- 4 files changed, 384 insertions(+), 3 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index e60da99293..34eb413f5d 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "sysemu/sysemu.h" @@ -198,6 +199,186 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) ms->numa_state->have_numa_distance = true; } +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp) +{ + int i, first_bit, last_bit; + uint64_t max_entry, temp_base, bitmap_copy; + NodeInfo *numa_info = numa_state->nodes; + HMAT_LB_Info *hmat_lb = + numa_state->hmat_lb[node->hierarchy][node->data_type]; + HMAT_LB_Data lb_data = {}; + HMAT_LB_Data *lb_temp; + + /* Error checking */ + if (node->initiator > numa_state->num_nodes) { + error_setg(errp, "Invalid initiator=%d, it should be less than %d", + node->initiator, numa_state->num_nodes); + return; + } + if (node->target > numa_state->num_nodes) { + error_setg(errp, "Invalid target=%d, it should be less than %d", + node->target, numa_state->num_nodes); + return; + } + if (!numa_info[node->initiator].has_cpu) { + error_setg(errp, "Invalid initiator=%d, it isn't an " + "initiator proximity domain", node->initiator); + return; + } + if (!numa_info[node->target].present) { + error_setg(errp, "The target=%d should point to an existing node", + node->target); + return; + } + + if (!hmat_lb) { + hmat_lb = g_malloc0(sizeof(*hmat_lb)); + numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; + hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); + } + hmat_lb->hierarchy = node->hierarchy; + hmat_lb->data_type = node->data_type; + 
lb_data.initiator = node->initiator; + lb_data.target = node->target; + + if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { + /* Input latency data */ + + if (!node->has_latency) { + error_setg(errp, "Missing 'latency' option"); + return; + } + if (node->has_bandwidth) { + error_setg(errp, "Invalid option 'bandwidth' since " + "the data type is latency"); + return; + } + + /* Detect duplicate configuration */ + for (i = 0; i < hmat_lb->list->len; i++) { + lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + + if (node->initiator == lb_temp->initiator && + node->target == lb_temp->target) { + error_setg(errp, "Duplicate configuration of the latency for " + "initiator=%d and target=%d", node->initiator, + node->target); + return; + } + } + + hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; + + if (node->latency) { + /* Calculate the temporary base and compressed latency */ + max_entry = node->latency; + temp_base = 1; + while (QEMU_IS_ALIGNED(max_entry, 10)) { + max_entry /= 10; + temp_base *= 10; + } + + /* Calculate the max compressed latency */ + temp_base = MIN(hmat_lb->base, temp_base); + max_entry = node->latency / hmat_lb->base; + max_entry = MAX(hmat_lb->range_bitmap, max_entry); + + /* + * For latency hmat_lb->range_bitmap record the max compressed + * latency which should be less than 0xFFFF (UINT16_MAX) + */ + if (max_entry >= UINT16_MAX) { + error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " + "target=%d should not differ from previously entered " + "min or max values on more than %d", node->latency, + node->initiator, node->target, UINT16_MAX - 1); + return; + } else { + hmat_lb->base = temp_base; + hmat_lb->range_bitmap = max_entry; + } + + /* + * Set lb_info_provided bit 0 as 1, + * latency information is provided + */ + numa_info[node->target].lb_info_provided |= BIT(0); + } + lb_data.data = node->latency; + } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) { + /* Input bandwidth data */ + if 
(!node->has_bandwidth) { + error_setg(errp, "Missing 'bandwidth' option"); + return; + } + if (node->has_latency) { + error_setg(errp, "Invalid option 'latency' since " + "the data type is bandwidth"); + return; + } + if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) { + error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and " + "target=%d should be 1MB aligned", node->bandwidth, + node->initiator, node->target); + return; + } + + /* Detect duplicate configuration */ + for (i = 0; i < hmat_lb->list->len; i++) { + lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + + if (node->initiator == lb_temp->initiator && + node->target == lb_temp->target) { + error_setg(errp, "Duplicate configuration of the bandwidth for " + "initiator=%d and target=%d", node->initiator, + node->target); + return; + } + } + + hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1; + + if (node->bandwidth) { + /* Keep bitmap unchanged when bandwidth out of range */ + bitmap_copy = hmat_lb->range_bitmap; + bitmap_copy |= node->bandwidth; + first_bit = ctz64(bitmap_copy); + temp_base = UINT64_C(1) << first_bit; + max_entry = node->bandwidth / temp_base; + last_bit = 64 - clz64(bitmap_copy); + + /* + * For bandwidth, first_bit record the base unit of bandwidth bits, + * last_bit record the last bit of the max bandwidth. 
The max + * compressed bandwidth should be less than 0xFFFF (UINT16_MAX) + */ + if ((last_bit - first_bit) > UINT16_BITS || + max_entry >= UINT16_MAX) { + error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " + "and target=%d should not differ from previously " + "entered values on more than %d", node->bandwidth, + node->initiator, node->target, UINT16_MAX - 1); + return; + } else { + hmat_lb->base = temp_base; + hmat_lb->range_bitmap = bitmap_copy; + } + + /* + * Set lb_info_provided bit 1 as 1, + * bandwidth information is provided + */ + numa_info[node->target].lb_info_provided |= BIT(1); + } + lb_data.data = node->bandwidth; + } else { + assert(0); + } + + g_array_append_val(hmat_lb->list, lb_data); +} + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) { Error *err = NULL; @@ -236,6 +417,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), &err); break; + case NUMA_OPTIONS_TYPE_HMAT_LB: + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err); + if (err) { + goto end; + } + break; default: abort(); } diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 788cbec7a2..70f93c83d7 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -14,11 +14,34 @@ struct CPUArchId; #define NUMA_DISTANCE_MAX 254 #define NUMA_DISTANCE_UNREACHABLE 255 +/* the value of AcpiHmatLBInfo flags */ +enum { + HMAT_LB_MEM_MEMORY = 0, + HMAT_LB_MEM_CACHE_1ST_LEVEL = 1, + HMAT_LB_MEM_CACHE_2ND_LEVEL = 2, + HMAT_LB_MEM_CACHE_3RD_LEVEL = 3, + HMAT_LB_LEVELS /* must be the last entry */ +}; + +/* the value of AcpiHmatLBInfo data type */ +enum { + HMAT_LB_DATA_ACCESS_LATENCY = 0, + HMAT_LB_DATA_READ_LATENCY = 1, + 
HMAT_LB_DATA_WRITE_LATENCY = 2, + HMAT_LB_DATA_ACCESS_BANDWIDTH = 3, + HMAT_LB_DATA_READ_BANDWIDTH = 4, + HMAT_LB_DATA_WRITE_BANDWIDTH = 5, + HMAT_LB_TYPES /* must be the last entry */ +}; + +#define UINT16_BITS 16 + struct NodeInfo { uint64_t node_mem; struct HostMemoryBackend *node_memdev; bool present; bool has_cpu; + uint8_t lb_info_provided; uint16_t initiator; uint8_t distance[MAX_NODES]; }; @@ -28,6 +51,31 @@ struct NumaNodeMem { uint64_t node_plugged_mem; }; +struct HMAT_LB_Data { + uint8_t initiator; + uint8_t target; + uint64_t data; +}; +typedef struct HMAT_LB_Data HMAT_LB_Data; + +struct HMAT_LB_Info { + /* Indicates it's memory or the specified level memory side cache. */ + uint8_t hierarchy; + + /* Present the type of data, access/read/write latency or bandwidth. */ + uint8_t data_type; + + /* The range bitmap of bandwidth for calculating common base */ + uint64_t range_bitmap; + + /* The common base unit for latencies or bandwidths */ + uint64_t base; + + /* Array to store the latencies or bandwidths */ + GArray *list; +}; +typedef struct HMAT_LB_Info HMAT_LB_Info; + struct NumaState { /* Number of NUMA nodes */ int num_nodes; @@ -40,11 +88,16 @@ struct NumaState { /* NUMA nodes information */ NodeInfo nodes[MAX_NODES]; + + /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ + HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; }; typedef struct NumaState NumaState; void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); void parse_numa_opts(MachineState *ms); +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp); void numa_complete_configuration(MachineState *ms); void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); extern QemuOptsList qemu_numa_opts; diff --git a/qapi/machine.json b/qapi/machine.json index 27d0e37534..cf8faf5a2a 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -426,10 +426,12 @@ # # @cpu: property based CPU(s) to node mapping (Since: 2.10) 
# +# @hmat-lb: memory latency and bandwidth information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } ## # @NumaOptions: @@ -444,7 +446,8 @@ 'data': { 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', - 'cpu': 'NumaCpuOptions' }} + 'cpu': 'NumaCpuOptions', + 'hmat-lb': 'NumaHmatLBOptions' }} ## # @NumaNodeOptions: @@ -557,6 +560,92 @@ 'base': 'CpuInstanceProperties', 'data' : {} } +## +# @HmatLBMemoryHierarchy: +# +# The memory hierarchy in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBMemoryHierarchy, see chapter +# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec. +# +# @memory: the structure represents the memory performance +# +# @first-level: first level of memory side cache +# +# @second-level: second level of memory side cache +# +# @third-level: third level of memory side cache +# +# Since: 5.0 +## +{ 'enum': 'HmatLBMemoryHierarchy', + 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] } + +## +# @HmatLBDataType: +# +# Data type in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBDataType, see chapter +# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec. 
+# +# @access-latency: access latency (nanoseconds) +# +# @read-latency: read latency (nanoseconds) +# +# @write-latency: write latency (nanoseconds) +# +# @access-bandwidth: access bandwidth (Bytes per second) +# +# @read-bandwidth: read bandwidth (Bytes per second) +# +# @write-bandwidth: write bandwidth (Bytes per second) +# +# Since: 5.0 +## +{ 'enum': 'HmatLBDataType', + 'data': [ 'access-latency', 'read-latency', 'write-latency', + 'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] } + +## +# @NumaHmatLBOptions: +# +# Set the system locality latency and bandwidth information +# between Initiator and Target proximity Domains. +# +# For more information about @NumaHmatLBOptions, see chapter +# 5.2.27.4: Table 5-146 of ACPI 6.3 spec. +# +# @initiator: the Initiator Proximity Domain. +# +# @target: the Target Proximity Domain. +# +# @hierarchy: the Memory Hierarchy. Indicates the performance +# of memory or side cache. +# +# @data-type: presents the type of data, access/read/write +# latency or hit latency. +# +# @latency: the value of latency from @initiator to @target +# proximity domain, the latency unit is "ns(nanosecond)". +# +# @bandwidth: the value of bandwidth between @initiator and @target +# proximity domain, the bandwidth unit is +# "Bytes per second". 
+# +# Since: 5.0 +## +{ 'struct': 'NumaHmatLBOptions', + 'data': { + 'initiator': 'uint16', + 'target': 'uint16', + 'hierarchy': 'HmatLBMemoryHierarchy', + 'data-type': 'HmatLBDataType', + '*latency': 'uint64', + '*bandwidth': 'size' }} + ## # @HostMemPolicy: # diff --git a/qemu-options.hx b/qemu-options.hx index b78bc52634..a0c0bbb7cf 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -175,16 +175,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa dist,src=source,dst=destination,val=distance\n" - "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" + "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] @findex -numa Define a NUMA node and assign RAM and VCPUs to it. Set the NUMA distance from a source node to a destination node. +Set the ACPI Heterogeneous Memory Attributes for the given nodes. Legacy VCPU assignment uses @samp{cpus} option where @var{firstcpu} and @var{lastcpu} are CPU indexes. Each @@ -263,6 +266,48 @@ specified resources, it just assigns existing resources to NUMA nodes. 
This means that one still has to use the @option{-m}, @option{-smp} options to allocate RAM and VCPUs respectively. +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information +between initiator and target NUMA nodes in ACPI Heterogeneous Memory Attribute Table (HMAT). +Initiator NUMA node can create memory requests, usually it has one or more processors. +Target NUMA node contains addressable memory. + +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{hierarchy} is the memory +hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the structure +represents the memory performance; if @var{hierarchy} is 'first-level|second-level|third-level', +this structure represents aggregated performance of memory side caches for each domain. +@var{type} of 'data-type' is type of data represented by this structure instance: +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency or 'access|read|write' +bandwidth of the target memory; if 'hierarchy' is 'first-level|second-level|third-level', +'data-type' is 'access|read|write' hit latency or 'access|read|write' hit bandwidth of the +target memory side cache. + +@var{lat} is latency value in nanoseconds. @var{bw} is bandwidth value, +the possible value and units are NUM[M|G|T], mean that the bandwidth value are +NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). +Note that if latency or bandwidth value is 0, means the corresponding latency or +bandwidth information is not provided. + +For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and +a ram, node 1 has only a ram. The processors in node 0 access memory in node +0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; +The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 +nanoseconds, access-bandwidth is 100 MB/s.
+@example +-machine hmat=on \ +-m 2G \ +-object memory-backend-ram,size=1G,id=m0 \ +-object memory-backend-ram,size=1G,id=m1 \ +-smp 2 \ +-numa node,nodeid=0,memdev=m0 \ +-numa node,nodeid=1,memdev=m1,initiator=0 \ +-numa cpu,node-id=0,socket-id=0 \ +-numa cpu,node-id=0,socket-id=1 \ +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M +@end example + ETEXI DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, From c412a48d4d91e8f8b89aae02de0f44f1f0b729e5 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Fri, 13 Dec 2019 09:19:24 +0800 Subject: [PATCH 13/32] numa: Extend CLI to provide memory side cache information Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-4-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Igor Mammedov --- hw/core/numa.c | 80 ++++++++++++++++++++++++++++++++++++++++++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +++++++++++++++++++++++++++++++++++++++++-- qemu-options.hx | 17 +++++++-- 4 files changed, 179 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 34eb413f5d..747c9680b0 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -379,6 +379,73 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ + int nb_numa_nodes = ms->numa_state->num_nodes; + NodeInfo *numa_info = ms->numa_state->nodes; + NumaHmatCacheOptions *hmat_cache = NULL; + + if (node->node_id >= nb_numa_nodes) { + error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); + return; + } + + if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { + error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache attributes", node->node_id); + return; + } + + if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { + error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " + "and less than or equal to %d", node->level, + HMAT_LB_LEVELS - 1); + return; + } + + assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); + assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); + if (ms->numa_state->hmat_cache[node->node_id][node->level]) { + error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); + return; + } + + if ((node->level > 1) && + ms->numa_state->hmat_cache[node->node_id][node->level - 1] && + (node->size >= + ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { + error_setg(errp, "Invalid size=%" PRIu64 ", the size of 
level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%u", node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); + return; + } + + if ((node->level < HMAT_LB_LEVELS - 1) && + ms->numa_state->hmat_cache[node->node_id][node->level + 1] && + (node->size <= + ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { + error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be larger than the size(%" PRIu64 ") of " + "level=%u", node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level + 1]->size, + node->level + 1); + return; + } + + hmat_cache = g_malloc0(sizeof(*hmat_cache)); + memcpy(hmat_cache, node, sizeof(*hmat_cache)); + ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; +} + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) { Error *err = NULL; @@ -430,6 +497,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) goto end; } break; + case NUMA_OPTIONS_TYPE_HMAT_CACHE: + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err); + if (err) { + goto end; + } + break; default: abort(); } diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 70f93c83d7..ba693cc80b 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -91,6 +91,9 @@ struct NumaState { /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; + + /* Memory Side Cache Information Structure */ + NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS]; }; typedef struct NumaState NumaState; @@ -98,6 +101,8 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); void 
parse_numa_opts(MachineState *ms); void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, Error **errp); +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp); void numa_complete_configuration(MachineState *ms); void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); extern QemuOptsList qemu_numa_opts; diff --git a/qapi/machine.json b/qapi/machine.json index cf8faf5a2a..b3d30bc816 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -428,10 +428,12 @@ # # @hmat-lb: memory latency and bandwidth information (Since: 5.0) # +# @hmat-cache: memory side cache information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } ## # @NumaOptions: @@ -447,7 +449,8 @@ 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', 'cpu': 'NumaCpuOptions', - 'hmat-lb': 'NumaHmatLBOptions' }} + 'hmat-lb': 'NumaHmatLBOptions', + 'hmat-cache': 'NumaHmatCacheOptions' }} ## # @NumaNodeOptions: @@ -646,6 +649,80 @@ '*latency': 'uint64', '*bandwidth': 'size' }} +## +# @HmatCacheAssociativity: +# +# Cache associativity in the Memory Side Cache Information Structure +# of HMAT +# +# For more information of @HmatCacheAssociativity, see chapter +# 5.2.27.5: Table 5-147 of ACPI 6.3 spec. +# +# @none: None (no memory side cache in this proximity domain, +# or cache associativity unknown) +# +# @direct: Direct Mapped +# +# @complex: Complex Cache Indexing (implementation specific) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheAssociativity', + 'data': [ 'none', 'direct', 'complex' ] } + +## +# @HmatCacheWritePolicy: +# +# Cache write policy in the Memory Side Cache Information Structure +# of HMAT +# +# For more information of @HmatCacheWritePolicy, see chapter +# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. 
+# +# @none: None (no memory side cache in this proximity domain, +# or cache write policy unknown) +# +# @write-back: Write Back (WB) +# +# @write-through: Write Through (WT) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheWritePolicy', + 'data': [ 'none', 'write-back', 'write-through' ] } + +## +# @NumaHmatCacheOptions: +# +# Set the memory side cache information for a given memory domain. +# +# For more information of @NumaHmatCacheOptions, see chapter +# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. +# +# @node-id: the memory proximity domain to which the memory belongs. +# +# @size: the size of memory side cache in bytes. +# +# @level: the cache level described in this structure. +# +# @associativity: the cache associativity, +# none/direct-mapped/complex(complex cache indexing). +# +# @policy: the write policy, none/write-back/write-through. +# +# @line: the cache Line size in bytes. +# +# Since: 5.0 +## +{ 'struct': 'NumaHmatCacheOptions', + 'data': { + 'node-id': 'uint32', + 'size': 'size', + 'level': 'uint8', + 'associativity': 'HmatCacheAssociativity', + 'policy': 'HmatCacheWritePolicy', + 'line': 'uint16' }} + ## # @HostMemPolicy: # diff --git a/qemu-options.hx b/qemu-options.hx index a0c0bbb7cf..d4b73ef60c 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -176,7 +176,8 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa dist,src=source,dst=destination,val=distance\n" "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" - "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", + "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" + "-numa 
hmat-cache,node-id=node,size=size,level=level[,associativity=none|direct|complex][,policy=none|write-back|write-through][,line=size]\n", QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] @@ -184,6 +185,7 @@ STEXI @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] @itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] +@itemx -numa hmat-cache,node-id=@var{node},size=@var{size},level=@var{level}[,associativity=@var{str}][,policy=@var{str}][,line=@var{size}] @findex -numa Define a NUMA node and assign RAM and VCPUs to it. Set the NUMA distance from a source node to a destination node. @@ -287,11 +289,20 @@ NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). Note that if latency or bandwidth value is 0, means the corresponding latency or bandwidth information is not provided. +In @samp{hmat-cache} option, @var{node-id} is the NUMA-id of the memory belongs. +@var{size} is the size of memory side cache in bytes. @var{level} is the cache +level described in this structure, note that the cache level 0 should not be used +with @samp{hmat-cache} option. @var{associativity} is the cache associativity, +the possible value is 'none/direct(direct-mapped)/complex(complex cache indexing)'. +@var{policy} is the write policy. @var{line} is the cache Line size in bytes. + For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and a ram, node 1 has only a ram. The processors in node 0 access memory in node 0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 nanoseconds, access-bandwidth is 100 MB/s. 
+And for memory side cache information, NUMA node 0 and 1 both have 1 level memory +cache, size is 10KB, policy is write-back, the cache Line size is 8 bytes: @example -machine hmat=on \ -m 2G \ @@ -305,7 +316,9 @@ nanoseconds, access-bandwidth is 100 MB/s. -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ --numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M \ +-numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8 \ +-numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8 @end example ETEXI From e6f123c3b81241be33f1b763d0ff8b36d1ae9c1e Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Fri, 13 Dec 2019 09:19:25 +0800 Subject: [PATCH 14/32] hmat acpi: Build Memory Proximity Domain Attributes Structure(s) HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table (HMAT). The specification references below link: http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf It describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use this information as hint for optimization. This structure describes Memory Proximity Domain Attributes by memory subsystem and its associativity with processor proximity domain as well as hint for memory usage. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. 
Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-5-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/Kconfig | 7 ++- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c | 99 +++++++++++++++++++++++++++++++++++++++++++ hw/acpi/hmat.h | 42 ++++++++++++++++++ hw/i386/acpi-build.c | 5 +++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 12e3f1e86e..54209c6f2f 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -7,6 +7,7 @@ config ACPI_X86 select ACPI_NVDIMM select ACPI_CPU_HOTPLUG select ACPI_MEMORY_HOTPLUG + select ACPI_HMAT config ACPI_X86_ICH bool @@ -23,6 +24,10 @@ config ACPI_NVDIMM bool depends on ACPI +config ACPI_HMAT + bool + depends on ACPI + config ACPI_PCI bool depends on ACPI && PCI @@ -33,5 +38,3 @@ config ACPI_VMGENID depends on PC config ACPI_HW_REDUCED - bool - depends on ACPI diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 99253057e1..777da07f4d 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o common-obj-$(call lnot,$(CONFIG_PC)) += acpi-x86-stub.o diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c new file mode 100644 index 0000000000..9ff79308a4 --- /dev/null +++ b/hw/acpi/hmat.c @@ -0,0 +1,99 @@ +/* + * HMAT ACPI Implementation + * + * Copyright(C) 2019 Intel Corporation. 
+ * + * Author: + * Liu jingqi + * Tao Xu + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/hmat.h" + +/* + * ACPI 6.3: + * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 + */ +static void build_hmat_mpda(GArray *table_data, uint16_t flags, + uint32_t initiator, uint32_t mem_node) +{ + + /* Memory Proximity Domain Attributes Structure */ + /* Type */ + build_append_int_noprefix(table_data, 0, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, 40, 4); + /* Flags */ + build_append_int_noprefix(table_data, flags, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Proximity Domain for the Attached Initiator */ + build_append_int_noprefix(table_data, initiator, 4); + /* Proximity Domain for the Memory */ + build_append_int_noprefix(table_data, mem_node, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + /* + * Reserved: + * Previously defined as the Start Address of the System Physical + * Address Range. Deprecated since ACPI Spec 6.3. + */ + build_append_int_noprefix(table_data, 0, 8); + /* + * Reserved: + * Previously defined as the Range Length of the region in bytes. 
+ * Deprecated since ACPI Spec 6.3. + */ + build_append_int_noprefix(table_data, 0, 8); +} + +/* Build HMAT sub table structures */ +static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) +{ + uint16_t flags; + int i; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; + + if (numa_state->nodes[i].initiator < MAX_NODES) { + flags |= HMAT_PROXIMITY_INITIATOR_VALID; + } + + build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); + } +} + +void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +{ + int hmat_start = table_data->len; + + /* reserve space for HMAT header */ + acpi_data_push(table_data, 40); + + hmat_build_table_structs(table_data, numa_state); + + build_header(linker, table_data, + (void *)(table_data->data + hmat_start), + "HMAT", table_data->len - hmat_start, 2, NULL, NULL); +} diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h new file mode 100644 index 0000000000..437dbc6872 --- /dev/null +++ b/hw/acpi/hmat.h @@ -0,0 +1,42 @@ +/* + * HMAT ACPI Implementation Header + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi + * Tao Xu + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#ifndef HMAT_H +#define HMAT_H + +#include "hw/acpi/aml-build.h" + +/* + * ACPI 6.3: 5.2.27.3 Memory Proximity Domain Attributes Structure, + * Table 5-145, Field "flag", Bit [0]: set to 1 to indicate that data in + * the Proximity Domain for the Attached Initiator field is valid. + * Other bits reserved. + */ +#define HMAT_PROXIMITY_INITIATOR_VALID 0x1 + +void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state); + +#endif diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 7b8da62d41..e25df838f0 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -68,6 +68,7 @@ #include "hw/i386/intel_iommu.h" #include "hw/acpi/ipmi.h" +#include "hw/acpi/hmat.h" /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows @@ -2835,6 +2836,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); build_slit(tables_blob, tables->linker, machine); } + if (machine->numa_state->hmat_enabled) { + acpi_add_table(table_offsets, tables_blob); + build_hmat(tables_blob, tables->linker, machine->numa_state); + } } if (acpi_get_mcfg(&mcfg)) { acpi_add_table(table_offsets, tables_blob); From 4586a2cb833f80b19c80ebe364a005ac2fa0974a Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Fri, 13 Dec 2019 09:19:26 +0800 Subject: [PATCH 15/32] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) This structure describes the memory access latency and bandwidth information from various memory access initiator proximity domains. The latency and bandwidth numbers represented in this structure correspond to rated latency and bandwidth for the platform. The software could use this information as hint for optimization. 
Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-6-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/hmat.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 9ff79308a4..4635d45dee 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -25,6 +25,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/numa.h" #include "hw/acpi/hmat.h" @@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, build_append_int_noprefix(table_data, 0, 8); } +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ + int i, index; + HMAT_LB_Data *lb_data; + uint16_t *entry_list; + uint32_t base; + /* Length in bytes for entire structure */ + uint32_t lb_length + = 32 /* Table length upto and including Entry Base Unit */ + + 4 * num_initiator /* Initiator Proximity Domain List */ + + 4 * num_target /* Target Proximity Domain List */ + + 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + + /* Type */ + build_append_int_noprefix(table_data, 1, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, lb_length, 4); + /* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ + assert(!(hmat_lb->hierarchy >> 4)); + build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); + /* Data Type */ + build_append_int_noprefix(table_data, hmat_lb->data_type, 1); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Number of Initiator Proximity Domains (s) */ + build_append_int_noprefix(table_data, num_initiator, 4); + /* 
Number of Target Proximity Domains (t) */ + build_append_int_noprefix(table_data, num_target, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + + /* Entry Base Unit */ + if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { + /* Convert latency base from nanoseconds to picosecond */ + base = hmat_lb->base * 1000; + } else { + /* Convert bandwidth base from Byte to Megabyte */ + base = hmat_lb->base / MiB; + } + build_append_int_noprefix(table_data, base, 8); + + /* Initiator Proximity Domain List */ + for (i = 0; i < num_initiator; i++) { + build_append_int_noprefix(table_data, initiator_list[i], 4); + } + + /* Target Proximity Domain List */ + for (i = 0; i < num_target; i++) { + build_append_int_noprefix(table_data, i, 4); + } + + /* Latency or Bandwidth Entries */ + entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); + for (i = 0; i < hmat_lb->list->len; i++) { + lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + index = lb_data->initiator * num_target + lb_data->target; + + entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); + } + + for (i = 0; i < num_initiator * num_target; i++) { + build_append_int_noprefix(table_data, entry_list[i], 2); + } + + g_free(entry_list); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; - int i; + uint32_t num_initiator = 0; + uint32_t initiator_list[MAX_NODES]; + int i, hierarchy, type; + HMAT_LB_Info *hmat_lb; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); } + + for (i = 0; i < numa_state->num_nodes; i++) { + if (numa_state->nodes[i].has_cpu) { + initiator_list[num_initiator++] = i; + } + } + + /* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ + 
for (hierarchy = HMAT_LB_MEM_MEMORY; + hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) { + for (type = HMAT_LB_DATA_ACCESS_LATENCY; + type <= HMAT_LB_DATA_WRITE_BANDWIDTH; type++) { + hmat_lb = numa_state->hmat_lb[hierarchy][type]; + + if (hmat_lb && hmat_lb->list->len) { + build_hmat_lb(table_data, hmat_lb, num_initiator, + numa_state->num_nodes, initiator_list); + } + } + } } void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) From a9c2b841af002db6e21e1297c9026b63fc22c875 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Fri, 13 Dec 2019 09:19:27 +0800 Subject: [PATCH 16/32] hmat acpi: Build Memory Side Cache Information Structure(s) This structure describes memory side cache information for memory proximity domains if the memory side cache is present and the physical device forms the memory side cache. The software could use this information to effectively place the data in memory to maximize the performance of the system memory that uses the memory side cache. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-7-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/acpi/hmat.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 4635d45dee..7c24bb5371 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, g_free(entry_list); } +/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ +static void build_hmat_cache(GArray *table_data, uint8_t total_levels, + NumaHmatCacheOptions *hmat_cache) +{ + /* + * Cache Attributes: Bits [3:0] – Total Cache Levels + * for this Memory Proximity Domain + */ + uint32_t cache_attr = total_levels; + + /* Bits [7:4] : Cache Level described in this structure */ + cache_attr |= (uint32_t) hmat_cache->level << 4; + + /* Bits [11:8] - Cache Associativity */ + cache_attr |= (uint32_t) hmat_cache->associativity << 8; + + /* Bits [15:12] - Write Policy */ + cache_attr |= (uint32_t) hmat_cache->policy << 12; + + /* Bits [31:16] - Cache Line size in bytes */ + cache_attr |= (uint32_t) hmat_cache->line << 16; + + /* Type */ + build_append_int_noprefix(table_data, 2, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, 32, 4); + /* Proximity Domain for the Memory */ + build_append_int_noprefix(table_data, hmat_cache->node_id, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + /* Memory Side Cache Size */ + build_append_int_noprefix(table_data, hmat_cache->size, 8); + /* Cache Attributes */ + build_append_int_noprefix(table_data, cache_attr, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* + * Number of SMBIOS handles (n) + * Linux kernel uses Memory Side Cache Information Structure + * without SMBIOS entries for now, so set Number of SMBIOS handles + * as 0. 
+ */ + build_append_int_noprefix(table_data, 0, 2); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; uint32_t num_initiator = 0; uint32_t initiator_list[MAX_NODES]; - int i, hierarchy, type; + int i, hierarchy, type, cache_level, total_levels; HMAT_LB_Info *hmat_lb; + NumaHmatCacheOptions *hmat_cache; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } } } + + /* + * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: + * Table 5-147 + */ + for (i = 0; i < numa_state->num_nodes; i++) { + total_levels = 0; + for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { + if (numa_state->hmat_cache[i][cache_level]) { + total_levels++; + } + } + for (cache_level = 0; cache_level <= total_levels; cache_level++) { + hmat_cache = numa_state->hmat_cache[i][cache_level]; + if (hmat_cache) { + build_hmat_cache(table_data, total_levels, hmat_cache); + } + } + } } void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) From d00817c944ed15fbe4a61d44fe7f9fe166c7df88 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Fri, 13 Dec 2019 09:19:28 +0800 Subject: [PATCH 17/32] tests/numa: Add case for QMP build HMAT Check configuring HMAT usecase Acked-by: Markus Armbruster Suggested-by: Igor Mammedov Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-8-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Igor Mammedov --- tests/numa-test.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..17dd807d2a 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? (char *)data : ""); + + /* Fail: Initiator should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Target should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Initiator should contain cpu */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Data-type mismatch */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"write-latency\"," + " 'bandwidth': 524288000 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 
'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," + " 'latency': 5 } }"))); + + /* Fail: Bandwidth should be 1MB (1048576) aligned */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 1048575 } }"))); + + /* Configuring HMAT bandwidth and latency details */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); /* 1 ns */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 5 } }"))); /* Fail: Duplicate configuration */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 68717379584 } }"))); /* 65534 MB/s */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 65534 } }"))); /* 65534 ns */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 34358689792 } }"))); /* 32767 MB/s */ + + /* Fail: node_id should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 
'type': 'hmat-cache', 'node-id': 2, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Fail: level should be less than HMAT_LB_LEVELS (4) */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 4, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Fail: associativity option should be 'none', if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"direct\", 'policy': \"none\"," + " 'line': 0 } }"))); + /* Fail: policy option should be 'none', if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"none\", 'policy': \"write-back\"," + " 'line': 0 } }"))); + /* Fail: line option should be 0, if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"none\", 'policy': \"none\"," + " 'line': 8 } }"))); + + /* Configuring HMAT memory side cache attributes */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); /* Fail: Duplicate configuration */ + /* Fail: The size of level 2 size should be small than level 1 */ + 
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 2, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + /* Fail: The size of level 0 size should be larger than level 1 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 1, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + +static void pc_hmat_off_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0", + data ? 
(char *)data : ""); + + /* + * Fail: Enable HMAT with -machine hmat=on + * before using any of hmat specific options + */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\"," + " 'initiator': 0 } }"))); + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\" } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + +static void pc_hmat_erange_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? 
(char *)data : ""); + + /* Can't store the compressed latency */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); /* 1 ns */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 65535 } }"))); /* 65535 ns */ + + /* Test the 0 input (bandwidth not provided) */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 0 } }"))); /* 0 MB/s */ + /* Fail: bandwidth should be provided before memory side cache attributes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Can't store the compressed bandwidth */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 68718428160 } }"))); /* 65535 MB/s */ + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + int main(int argc, char **argv) { const char *args = NULL; @@ -346,6 +556,9 @@ int main(int argc, char **argv) if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) { qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu); qtest_add_data_func("/numa/pc/dynamic/cpu", args, 
pc_dynamic_cpu_cfg); + qtest_add_data_func("/numa/pc/hmat/build", args, pc_hmat_build_cfg); + qtest_add_data_func("/numa/pc/hmat/off", args, pc_hmat_off_cfg); + qtest_add_data_func("/numa/pc/hmat/erange", args, pc_hmat_erange_cfg); } if (!strcmp(arch, "ppc64")) { From 1c8f85d93d261dc555a0aad6f54f2b5e8009d859 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Fri, 13 Dec 2019 09:19:29 +0800 Subject: [PATCH 18/32] tests/bios-tables-test: add test cases for ACPI HMAT ACPI table HMAT has been introduced, QEMU now builds HMAT tables for Heterogeneous Memory with boot option '-numa node'. Add test cases on PC and Q35 machines with 2 numa nodes. Because HMAT is generated when system enable numa, the following tables need to be added for this test: tests/data/acpi/pc/APIC.acpihmat tests/data/acpi/pc/SRAT.acpihmat tests/data/acpi/pc/HMAT.acpihmat tests/data/acpi/pc/DSDT.acpihmat tests/data/acpi/q35/APIC.acpihmat tests/data/acpi/q35/SRAT.acpihmat tests/data/acpi/q35/HMAT.acpihmat tests/data/acpi/q35/DSDT.acpihmat Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jingqi Liu Suggested-by: Igor Mammedov Signed-off-by: Tao Xu Message-Id: <20191213011929.2520-9-tao3.xu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- tests/bios-tables-test-allowed-diff.h | 8 +++++ tests/bios-tables-test.c | 44 +++++++++++++++++++++++++++ tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 10 files changed, 52 insertions(+) create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index dfb8523c8b..3c9e0c979b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ b/tests/bios-tables-test-allowed-diff.h @@ -1 +1,9 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/APIC.acpihmat", +"tests/data/acpi/pc/SRAT.acpihmat", +"tests/data/acpi/pc/HMAT.acpihmat", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/q35/APIC.acpihmat", +"tests/data/acpi/q35/SRAT.acpihmat", +"tests/data/acpi/q35/HMAT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index bc0ad594a1..f1ac2d7e96 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ + test_data data; + + memset(&data, 0, sizeof(data)); + data.machine = machine; + data.variant = ".acpihmat"; + test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + 
" -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1," + "associativity=direct,policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1," + "associativity=direct,policy=write-back,line=8", + &data); + free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ + test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ + test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem); qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm); qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm); + qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat); + qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat); } else if (strcmp(arch, "aarch64") == 0) { qtest_add_func("acpi/virt", test_acpi_virt_tcg); qtest_add_func("acpi/virt/numamem", test_acpi_virt_tcg_numamem); diff --git a/tests/data/acpi/pc/APIC.acpihmat b/tests/data/acpi/pc/APIC.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/pc/HMAT.acpihmat b/tests/data/acpi/pc/HMAT.acpihmat new file 
mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/pc/SRAT.acpihmat b/tests/data/acpi/pc/SRAT.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/q35/APIC.acpihmat b/tests/data/acpi/q35/APIC.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/q35/HMAT.acpihmat b/tests/data/acpi/q35/HMAT.acpihmat new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/q35/SRAT.acpihmat b/tests/data/acpi/q35/SRAT.acpihmat new file mode 100644 index 0000000000..e69de29bb2 From 48892c6c8def6624a0ed57e2bd6c2a0a9878b973 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Dec 2019 08:17:20 -0500 Subject: [PATCH 19/32] ACPI: add expected files for HMAT tests (acpihmat) Signed-off-by: Michael S. Tsirkin --- tests/bios-tables-test-allowed-diff.h | 8 -------- tests/data/acpi/pc/APIC.acpihmat | Bin 0 -> 128 bytes tests/data/acpi/pc/DSDT.acpihmat | Bin 0 -> 6455 bytes tests/data/acpi/pc/HMAT.acpihmat | Bin 0 -> 280 bytes tests/data/acpi/pc/SRAT.acpihmat | Bin 0 -> 280 bytes tests/data/acpi/q35/APIC.acpihmat | Bin 0 -> 128 bytes tests/data/acpi/q35/DSDT.acpihmat | Bin 0 -> 9203 bytes tests/data/acpi/q35/HMAT.acpihmat | Bin 0 -> 280 bytes tests/data/acpi/q35/SRAT.acpihmat | Bin 0 -> 280 bytes 9 files changed, 8 deletions(-) diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index 3c9e0c979b..dfb8523c8b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ b/tests/bios-tables-test-allowed-diff.h @@ -1,9 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/pc/APIC.acpihmat", -"tests/data/acpi/pc/SRAT.acpihmat", -"tests/data/acpi/pc/HMAT.acpihmat", -"tests/data/acpi/pc/DSDT.acpihmat", -"tests/data/acpi/q35/APIC.acpihmat", -"tests/data/acpi/q35/SRAT.acpihmat", 
-"tests/data/acpi/q35/HMAT.acpihmat", -"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/data/acpi/pc/APIC.acpihmat b/tests/data/acpi/pc/APIC.acpihmat index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a21f164699bfccd8992ea1bdb5717f2dc3025496 100644 GIT binary patch literal 128 zcmZ<^@N{lqU|?Xp<>c?|5v<@85#a0y6k`O6f!H9Lf#JbFFwFr}2jX%tGJA8l)n3DUlYJT5~Da#Tw;OmQgB5 z;e`?xQH-Fp0w_-I0@g(f^nx~cZ9hWu2zi9`6;d?uRn+h7awwY80?9=Qh?+C!o9~>N zIp@p_4clm3JxpZZtF6 zQqyh}m`6RXM_sK?U7@Go8VvOxF_0>z{4Y}*=owjb~^1iRBDC2O&%H{P469?*Yd<3S)Dt4h6;IOcSyOPx- z!WD4$wLe@U78=P|`7%3EwMsS4-eFO_K#izg#6ML(cR4Bz6PvU5R=uHvG+44E7K{9y z6ECfDk9kauEHJ*xci0Y#Ozbje@9J977{a4bE#a@qaH9S|m${5%)E3*q|Ah$V>+HR5 zu5SznPS1`HR78A%sRS%2D~3MY#1jLL=EdA9|33PCl*Ly0kI^5oPz%fKV$A2xtyHao z-0T&jGLuo6h>LZR<>Hx$h*$A9Bj&|{_)z>HriG$3SBz5nl+Y*)M?Vn=&ffb~t1uE{d-UPP0&mPByb5_OVD*q(BW%2Nc(JOlG3`}LFI|f`=Sey^@YH<&jQ>9`(iJt-z0wM57Jv?U^J)4RXZ+GHZiZuivg zZGaL;n`&*%U|YQl-P^pE?zTj1*ln||$CE>;08qMnTSSIE#X(PW*rT&8@3Y-ap)w>c zd$`4zcfSRD54Sk;wjR1IcCXcUod*{#N6A~t70Nbl)vsq2eC6nCk-qYZHe0!lRqZA2 zi%uI!pXiIEwp6*U*AoELv*{_3{BnXN{9xN>e`_EP71Ng*)#Txr%OxH~VrDldeQQa<8Ge8=JMn+3DE47N^G3vzH=Wh8`3cdvXO%`;klFjC_}?-2FHc0qdkOl%cqf+NSnr(A-=_dWw&8kU!5Ae#H60Rg?h_=Nlq`P6jF9mmnKqB)ShY&cQg#u0?%&`j zQ7BN5I`NVGHp^B}!6vPml`UkK;4co4N%PL1?(Xbn+DFnnxbOw}EpV#LQGS;#I_{Ye zbGVO{g3JVSP@2b>jf!bzw(k6Sn~H{F8nwEJI1et3EHwE)Jt(s(vC}! 
zP9%M zs8J=79qOhAjU~ZX)99Y|i26vsLo)X}(|#xac-TX&D(R{{<3yLb5kU3)V)~po`^Blz zDbrMGnlwd!dig~mK;Oii(44~9LGvUWYI z9!00zR9E{I0;>@_0@|ja^FE-c3n;Y(#Az1y;TN|PVS?}tAV}K@)5_wVL)n@A1f^zp z@^|E&fr5hckg4+YDW(Q8)tEl{?8|AYKvD9JMAQw8pprsv_&o!2c#{mw<^Hecw6i8M zV{lN@GzSMYnmDKrt65%FGb%2WlInVl3Z1_cgBt)!nD#CzUyjj9F?_uSPmPfdORxE* z`~`%nYeDs*OMFo3mWBA|!$}swQ=bBgzVKFko_e0*3i^4FXFo#yJT;Uj{qXzGXiy3S zBVEAh6Jfq4DE5pg2M7Q9DbRfL!;#$mhG2-}-_m1q$81!w)DkM~dLM1B#FK{3k23?Y@DG5!5LM7{@gieM*uS(V_37ras zN>;q117D}ZpzlalRYKKJsANq`XgUn~u4J8&(3w!EWSy1J*)V89vLf<>X=$NQ$@-Fn zz7z&6O4gGSdNLF$Sx-smsW7N6S?45lE)*(RPfO_OFsLC}&q(N*P^e^`m(ckzXi2i3 zmC&=HP|5nTguWaGU6HJ>Na!n}P|12uLeCus)ynh6jannpOWkkdA+k@kZrc}B2&iU( zkRd^8mLZ@b1)2^Cq?x?mPU01_Z=MiD(R;0w^3bjitO7+I4R>CfqbaPX|iu4b)+6IEy#y@m1HD=)MtG8P`^wv!ddD& zzI>?nXNek Qg_O1sXWa%LGz<~_AAR>eTL1t6 literal 0 HcmV?d00001 diff --git a/tests/data/acpi/pc/HMAT.acpihmat b/tests/data/acpi/pc/HMAT.acpihmat index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c00f7ba6cd0acecbc4b158f430d29b2f32988522 100644 GIT binary patch literal 280 zcmeb9bqtYUWME*L;pFe^5v<@85#a0r6axw|fY=}!1~h;SWIjwBokmuNOFc;30ICth zW`eR`Fhdzga*PcB{=?M+<&o8c*vJ6H|M&lYJ;*!-WC@U?fIv`?15Cr@;rh|!0TAyG A0RR91 literal 0 HcmV?d00001 diff --git a/tests/data/acpi/pc/SRAT.acpihmat b/tests/data/acpi/pc/SRAT.acpihmat index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1dcae90aec688e88f9d212e632faaaaf2e0dc7bc 100644 GIT binary patch literal 280 zcmWFzatx7RWME)C;N1+c3F q8VCj-m|+T0)xmizPc?|5v<@85#a0y6k`O6f!H9Lf#JbFFwFr}2jX%tGJl1TZsreZQ3+HD9Lq;HVsVfQY0;|Op$crfCjjB<5o&q zWT7}k(jb7O0QuoVfrNF?-snJw-g@gb{u6R*fS!8oH7Md!)c4Ko$TLd{h!53)oNxBM z@0+(fGjFv^zvFkmxxkneRIYmUPO);m<@xBd7-Q6?Z>N#D!FqdsrPjA{sf^Xz&xtlZ zDz^KqU%6JZ{<0l@9)@>53ay(FyY+>0@7B%egO9^oj6iSSia4i+v%&kFO22pD@s`&p z+5LLM&zHaAw%zw+D{c47ew*3-YYpFWo0I*k9WQhD4d(f;tPD3N2HS;s?(~9xt$n+^ zboujF?vx+=>Yu;4`v%Vdu!?UR-)j+lgztrXIUG8l4R);ei7t+<4Cg-^h{Lkap(9a9 zJ@@mni&Cu|zTpJ69Bb9Cx2jz=RtqD<*l<4Tt!}{bjD7W8j%9lL4#o2?S2z7)tL^uT 
z?xlXGTV@gUgb{V!{A0+SaG3ve5VAp-J32aK!XDpf7<-pFmnWa6;m~Qr>B}}c<-Ryo7{D?H(`vN0Qat4O-d+<|Fva(Hs<(fJ z+RVEel+(<@R|Q|qR@YAnR5is92z3gmD)Y+KP0Op`quIiTbNTcOX;qP`^$wnEcRdu9 z*DQx?L?d0~r)pNjBIcydGplCpvR#(SzRP+CKijDI$MAI8of7BcSfU_?EMyi~ud&zg zeLe-Hy@RKNtJjW+v-3%!%_q(?hk!3Z%P}y++(miDY5d_Zi?e*l?q`k*I()ijy_0?? zZQJMfM4@U1=VV1Gny}(o7pI{Ua#EUy>4#}%pLax>bxO0ENW)WVICM@=%#(VULLM7= z>{i2DXKucaZ!6lS$obQ&7F`0z`;S;It#-FHxAE5ATrPvz!?Ue?l6%tcb!MvAPktZXV3Yw8jHF$)&&Iv>3gkUP@gk-A0Bh+N>GQXlHuL&^cx5M5ycJhE8tiN=+lozsTSX;UXcUFVFUbH>m)W9mex>zp-o z&Kf#rO`QmJosOZ?F?2enPK3J7f}yiu=q#8z5$ZbU44rd^&N)*jLS5&)p>y8QIdAGj zsOxkMovxwNHFYA?b)GVGo-%ZvGIb)7Yv;XrcQ*q&eMj@(}vE|rcQ*q&P7A# zqM>uq)QM2nc}6hH@lJR~Fe~w9cSbV3M#1vZ4{0T6amW0AdNs3g@H<(up|rjI|8U2GbN!aB&L!E1C>~1pc<46 zR6+%l3=|$|V8TE(C>f}P3MLsSLZuT1DzSnI1J$5ppb{#WWS|I@P8g`f z3MLFxgOY(ts9=(TB2+qIpb{&XFi;Ij1}dR~Nd}5g>4bqwtYE@GH7FUVgbF4ZC_<$Z z1}d?F2?N!jWS|l%m}H;`l};F_#0n-1RD+U%N~mCxfg)5oVW1K#m@rTcN(L&Sf=LF7 zQ0at$N~~bQKs6{CsDuh887M-f69y`=f(ZlFpk$yDDwt%T2$fD4sKg2;3{-=Xfl8=g zl7S*rI$@v^E0{1)4N3+op@K;Uicsl$|V8TE( zC>f}P3MLsSLZuT1DzSnI1J$5ppb{#WWS|I@P8g`f3MLFxgOY(ts9=(TB2+qIpolaB zMWh=jLft?SY6hw?VW1k53{+#1foe<`sK$hWYD_XvjY$ToF=3z@69%d=$v`zG8K}mD zfoe<`sKz7%)tF?U8WRSJNL^hRC?Z@jaWct35#lnakeKS+!axzJa|;7Sq|PlFC_;5^ z$v_eM%stzOg|cCOApTiBq<={7XX&fl+yBK2Y5Fgfo)ys32&>iJD#LNND)iyehi@}H z8`-K-d!0UN^ij9n`NmeKYYnH~<9pk92r_lZbMD2)me-yFpj|4=@Z2X^HboDk@SpNs z>f;mmY((#E^I1kM45E!EBh5+NBb3$u;(0typ@%g#hTSPF@cB<>*i;(s#;|`g?8M0C z&dTQ!Gz5&M>=ycdqr={(Hs2FHv-G5c9nyeAe7fIhcdWbH@ffH)^~cpKiRyT)@V%>7 zwCWX6y@K)S)hq7!>XqI>K-#@Tb)I^R@}5@S6Xm@`dGCbsUR>T!mdE3=@2%d~%KM_c zpD6F2P~MNrSCi#4k5Rs=m9L8O)kOK~3FWJC`SoP^iN`3vu9aUG<<}GC*H0+F9+$5r z%V!^>d`&A~6Xk1(^0gDn*W&W^WckU*C|}pg*G2hyqI~^?@^zHY&{mc#@5u6M^T2Hd zJ!TV6EzLeRE0#RIXkEwYlFOJE%yhWd7)dAl#-ro;WqNIKa!Hr>d)UNF<)re-9a>>HbIWS*Wj z)8S5MB%SOt(>XuT9?75S1eN%OM%H0i_jrwwPvhCVbLB<(9>L=`9z9WSwW}R9Upba3 zTx!~DT`Sh#yc7NI?cWx6Z@=~C+q=KM{TAz4UaPqg&)aFvYu4BKuk%(O55VD0u+7}p zc|7*tzd%8m6+L=-?=?0%me&qmw!G?e~v(DuTyOnf%MDWA|5z5HXC-t 
zs1s9v<%ST2vB>dFm%Ub_R9Y#~2;@7O33tisuD_ZZ^1(Ly&?}WnJJhtOxw7NdvmORA z)w^)xg^YTEKypdO(NVOW>_**)g~PLhA@!7EV^vIFs#$o6N6D02O3R%fi$S^j) zTc7bd-llz$3@iMioEwCA^$|uanG`2`Sj&tE5*Gz$E(ynVu%eNX0UcT4{f*49KAK8g z+VQD)dNdc{$TePQFy&qCbn_<(9ece-1`EZ}&KO-0PbGT^`g*bxr(o>pdLO(*6AUX7 zf0(P7S@t2#FHFav+%4p|&G7#}-5I(wcv;xZkr=kU^E%e_^?>z0*vQbd6t|8>0^F7z zC$KD$AlIkGB36u+%sNiyp2Om(wW|GUX%*L8ydI;4LK6i`XF^-wLA@on=;_vGt5hys z!gy(o2e*I!=I*=2?U7|JeMziu+?;Z@Nb9@a3fdN)0`SG&LAfedCNfJeNo4jLK~Q|{ z_8)d=)35`7vrX^zp4Uz^jfpa*gLb7^#77+;O=>mKYF+Oad;8Ch!I#-(9#lJhE1(G+ z(9{d6SNeDl{L~n9I-uPpxLSP~A04mg&Wro&;naY}gjaI7x$jv))7xNfIUC^Au}8XXzbmHOiLrd+G+1CIuZwOgqmZ z&8ax1J?4)C(cBC1AraODHr4xB!9bslTx^nc;BEfH{KLFu zqu-J<=UW_pm_LrE#T@!ZpIIC;ivAs^zZ4TkFY-VnVp8>3AMz-Z` HcG>>`zx=Ox literal 0 HcmV?d00001 diff --git a/tests/data/acpi/q35/HMAT.acpihmat b/tests/data/acpi/q35/HMAT.acpihmat index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c00f7ba6cd0acecbc4b158f430d29b2f32988522 100644 GIT binary patch literal 280 zcmeb9bqtYUWME*L;pFe^5v<@85#a0r6axw|fY=}!1~h;SWIjwBokmuNOFc;30ICth zW`eR`Fhdzga*PcB{=?M+<&o8c*vJ6H|M&lYJ;*!-WC@U?fIv`?15Cr@;rh|!0TAyG A0RR91 literal 0 HcmV?d00001 diff --git a/tests/data/acpi/q35/SRAT.acpihmat b/tests/data/acpi/q35/SRAT.acpihmat index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1dcae90aec688e88f9d212e632faaaaf2e0dc7bc 100644 GIT binary patch literal 280 zcmWFzatx7RWME)C;N1+c3F q8VCj-m|+T0)xmizP Date: Fri, 13 Dec 2019 10:54:10 +0100 Subject: [PATCH 20/32] virtio-mmio: Clear v2 transport state on soft reset At the moment when the guest writes a status of 0, we only reset the virtio core state but not the virtio-mmio state. The virtio-mmio specification says (v1.1 cs01, 4.2.2.1 Device Requirements: MMIO Device Register Layout): Upon reset, the device MUST clear all bits in InterruptStatus and ready bits in the QueueReady register for all queues in the device. The core already takes care of InterruptStatus by clearing isr, but we still need to clear QueueReady. 
It would be tempting to clean all registers, but since the specification doesn't say anything more, guests could rely on the registers keeping their state across reset. Linux, for example, relies on this for GuestPageSize in the legacy MMIO transport. Fixes: 44e687a4d9ab ("virtio-mmio: implement modern (v2) personality (virtio-1)") Signed-off-by: Jean-Philippe Brucker Message-Id: <20191213095410.1516119-1-jean-philippe@linaro.org> Reviewed-by: Sergio Lopez Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-mmio.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 94d934c44b..ef40b7a9b2 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -65,6 +65,19 @@ static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy) virtio_bus_stop_ioeventfd(&proxy->bus); } +static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy) +{ + int i; + + if (proxy->legacy) { + return; + } + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].enabled = 0; + } +} + static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size) { VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque; @@ -378,6 +391,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, if (vdev->status == 0) { virtio_reset(vdev); + virtio_mmio_soft_reset(proxy); } break; case VIRTIO_MMIO_QUEUE_DESC_LOW: From 4ce537a7165a33a09f47587299f39f6fe647cde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 16 Dec 2019 01:21:33 +0100 Subject: [PATCH 21/32] hw/pci/pci_host: Remove redundant PCI_DPRINTF() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 3bf4dfdd111 we introduced the pci_cfg_[read/write] trace events in pci_host_config_[read/write]_common().
We have the following call trace: pci_host_data_[read/write]() - PCI_DPRINTF() - pci_data_[read/write]() - PCI_DPRINTF() - pci_host_config_[read/write]_common() trace_pci_cfg_[read/write]() Since the PCI_DPRINTF() calls are redundant with the trace events, remove them. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20191216002134.18279-2-philmd@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci_host.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index c5f9244934..0958d157de 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -115,8 +115,6 @@ void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len) return; } - PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n", - __func__, pci_dev->name, config_addr, val, len); pci_host_config_write_common(pci_dev, config_addr, PCI_CONFIG_SPACE_SIZE, val, len); } @@ -125,18 +123,13 @@ uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len) { PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr); uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1); - uint32_t val; if (!pci_dev) { return ~0x0; } - val = pci_host_config_read_common(pci_dev, config_addr, - PCI_CONFIG_SPACE_SIZE, len); - PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n", - __func__, pci_dev->name, config_addr, val, len); - - return val; + return pci_host_config_read_common(pci_dev, config_addr, + PCI_CONFIG_SPACE_SIZE, len); } static void pci_host_config_write(void *opaque, hwaddr addr, @@ -167,8 +160,7 @@ static void pci_host_data_write(void *opaque, hwaddr addr, uint64_t val, unsigned len) { PCIHostState *s = opaque; - PCI_DPRINTF("write addr " TARGET_FMT_plx " len %d val %x\n", - addr, len, (unsigned)val); + if (s->config_reg & (1u << 31)) pci_data_write(s->bus, s->config_reg | (addr & 3), val, len); } @@ -177,14 +169,11 @@ static uint64_t pci_host_data_read(void *opaque, hwaddr addr, 
unsigned len) { PCIHostState *s = opaque; - uint32_t val; + if (!(s->config_reg & (1U << 31))) { return 0xffffffff; } - val = pci_data_read(s->bus, s->config_reg | (addr & 3), len); - PCI_DPRINTF("read addr " TARGET_FMT_plx " len %d val %x\n", - addr, len, val); - return val; + return pci_data_read(s->bus, s->config_reg | (addr & 3), len); } const MemoryRegionOps pci_host_conf_le_ops = { From f2a7e8f170252081ff48b99f34dbd1f0211d7938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 16 Dec 2019 01:21:34 +0100 Subject: [PATCH 22/32] hw/pci/pci_host: Let pci_data_[read/write] use unsigned 'size' argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both functions are called by MemoryRegionOps.[read/write] handlers with unsigned 'size' argument. Both functions call pci_host_config_[read/write]_common() which expects a uint32_t 'len' parameter (also unsigned). Since it is pointless (and confusing) to use a signed value, use an unsigned type. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20191216002134.18279-3-philmd@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/pci/pci_host.c | 4 ++-- include/hw/pci/pci_host.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index 0958d157de..ce7bcdb1d5 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -106,7 +106,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, return ret; } -void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len) +void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len) { PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr); uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1); @@ -119,7 +119,7 @@ void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len) val, len); } -uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len) +uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len) { PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr); uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1); diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h index ba31595fc7..9ce088bd13 100644 --- a/include/hw/pci/pci_host.h +++ b/include/hw/pci/pci_host.h @@ -62,8 +62,8 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr, uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, uint32_t limit, uint32_t len); -void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len); -uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len); +void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len); +uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len); extern const MemoryRegionOps pci_host_conf_le_ops; extern const MemoryRegionOps pci_host_conf_be_ops; From d91d57e604edc128be302b60dabba6a34f0e0f0f Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Tue, 29 Oct 2019 17:38:02 -0400 Subject: [PATCH 23/32] vhost-user: add VHOST_USER_RESET_DEVICE to reset devices Add a VHOST_USER_RESET_DEVICE message which will reset the vhost user backend. 
This disables all rings and resets all internal state, ready for the backend to be reinitialized. A backend has to report it supports this feature with the VHOST_USER_PROTOCOL_F_RESET_DEVICE protocol feature bit. If it does so, the new message is used instead of sending a RESET_OWNER which has had inconsistent implementations. Signed-off-by: David Vrabel Signed-off-by: Raphael Norwitz Message-Id: <1572385083-5254-2-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/interop/vhost-user.rst | 15 +++++++++++++++ hw/virtio/vhost-user.c | 8 +++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 015ac08177..5f8b3a456b 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -785,6 +785,7 @@ Protocol features #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 + #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 Master message types -------------------- @@ -1190,6 +1191,20 @@ Master message types ancillary data. The GPU protocol is used to inform the master of rendering state and updates. See vhost-user-gpu.rst for details. +``VHOST_USER_RESET_DEVICE`` + :id: 34 + :equivalent ioctl: N/A + :master payload: N/A + :slave payload: N/A + + Ask the vhost user backend to disable all rings and reset all + internal device state to the initial state, ready to be + reinitialized. The backend retains ownership of the device + throughout the reset operation. + + Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol + feature is set by the backend.
+ Slave message types ------------------- diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 02a9b25199..d27a10fcc6 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -58,6 +58,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, VHOST_USER_PROTOCOL_F_MAX }; @@ -98,6 +99,7 @@ typedef enum VhostUserRequest { VHOST_USER_GET_INFLIGHT_FD = 31, VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_RESET_DEVICE = 34, VHOST_USER_MAX } VhostUserRequest; @@ -890,10 +892,14 @@ static int vhost_user_set_owner(struct vhost_dev *dev) static int vhost_user_reset_device(struct vhost_dev *dev) { VhostUserMsg msg = { - .hdr.request = VHOST_USER_RESET_OWNER, .hdr.flags = VHOST_USER_VERSION, }; + msg.hdr.request = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RESET_DEVICE) + ? VHOST_USER_RESET_DEVICE + : VHOST_USER_RESET_OWNER; + if (vhost_user_write(dev, &msg, NULL, 0) < 0) { return -1; } From f0472439383bb5bf5d760fb96e084beccbec03e4 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Tue, 29 Oct 2019 17:38:03 -0400 Subject: [PATCH 24/32] vhost-user-scsi: reset the device if supported If the vhost-user-scsi backend supports the VHOST_USER_F_RESET_DEVICE protocol feature, then the device can be reset when requested. If this feature is not supported, do not try a reset as this will send a VHOST_USER_RESET_OWNER that the backend is not expecting, potentially putting into an inoperable state. Signed-off-by: David Vrabel Signed-off-by: Raphael Norwitz Message-Id: <1572385083-5254-3-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/scsi/vhost-user-scsi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index 6a6c15dd32..23f972df59 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -39,6 +39,10 @@ static const int user_feature_bits[] = { VHOST_INVALID_FEATURE_BIT }; +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, +}; + static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserSCSI *s = (VHostUserSCSI *)vdev; @@ -62,6 +66,25 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) } } +static void vhost_user_scsi_reset(VirtIODevice *vdev) +{ + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(vdev); + struct vhost_dev *dev = &vsc->dev; + + /* + * Historically, reset was not implemented so only reset devices + * that are expecting it. + */ + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RESET_DEVICE)) { + return; + } + + if (dev->vhost_ops->vhost_reset_device) { + dev->vhost_ops->vhost_reset_device(dev); + } +} + static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) { } @@ -182,6 +205,7 @@ static void vhost_user_scsi_class_init(ObjectClass *klass, void *data) vdc->get_features = vhost_scsi_common_get_features; vdc->set_config = vhost_scsi_common_set_config; vdc->set_status = vhost_user_scsi_set_status; + vdc->reset = vhost_user_scsi_reset; fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path; } From d940208cbbbecd52bb6bac539c5e486bcc07abba Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Mon, 23 Dec 2019 10:28:56 +0300 Subject: [PATCH 25/32] hw: fix using 4.2 compat in 5.0 machine types for i440fx/q35 5.0 machine type uses 4.2 compats. This seems to be incorrect, since the latest machine type by now is 5.0 and it should use its own compat or shouldn't use any relying on the defaults. It seems this appeared because of some problems on merge/rebase.
Signed-off-by: Denis Plotnikov Message-Id: <20191223072856.5369-1-dplotnikov@virtuozzo.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_piix.c | 1 - hw/i386/pc_q35.c | 1 - 2 files changed, 2 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 721c7aa64e..fa12203079 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -425,7 +425,6 @@ static void pc_i440fx_5_0_machine_options(MachineClass *m) m->alias = "pc"; m->is_default = 1; pcmc->default_cpu_version = 1; - compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); } DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 52f45735e4..84cf925cf4 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -354,7 +354,6 @@ static void pc_q35_5_0_machine_options(MachineClass *m) pc_q35_machine_options(m); m->alias = "q35"; pcmc->default_cpu_version = 1; - compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); } DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL, From 1bf8a989a566b2ba41c197004ec2a02562a766a4 Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Fri, 20 Dec 2019 17:09:04 +0300 Subject: [PATCH 26/32] virtio: make seg_max virtqueue size dependent Before the patch, seg_max parameter was immutable and hardcoded to 126 (128 - 2) without respect to queue size. This has two negative effects: 1. when queue size is < 128, we have Virtio 1.1 specification violation: (2.6.5.3.1 Driver Requirements) seg_max must be <= queue_size. This violation affects the old Linux guests (ver < 4.14). These guests crash on these queue_size setups. 2. when queue_size > 128, as was pointed out by Denis Lunev , seg_max restricts guest's block request length which affects guests' performance making them issue more block requests than needed. https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html To mitigate these two effects, the patch adds the property adjusting seg_max to queue size automatically.
Since seg_max is a guest visible parameter, the property is machine type manageable and allows choosing between old (seg_max = 126 always) and new (seg_max = queue_size - 2) behaviors. Not to change the behavior of the older VMs, prevent setting the default seg_max_adjust value for older machine types. Reviewed-by: Stefan Hajnoczi Signed-off-by: Denis Plotnikov Message-Id: <20191220140905.1718-2-dplotnikov@virtuozzo.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/block/virtio-blk.c | 9 ++++++++- hw/core/machine.c | 3 +++ hw/scsi/vhost-scsi.c | 2 ++ hw/scsi/virtio-scsi.c | 10 +++++++++- include/hw/virtio/virtio-blk.h | 1 + include/hw/virtio/virtio-scsi.h | 1 + 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index b12157b5eb..9bee514c4e 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -913,7 +913,8 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) blk_get_geometry(s->blk, &capacity); memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); - virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2); + virtio_stl_p(vdev, &blkcfg.seg_max, + s->conf.seg_max_adjust ?
s->conf.queue_size - 2 : 128 - 2); virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1138,6 +1139,11 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) error_setg(errp, "num-queues property must be larger than 0"); return; } + if (conf->queue_size <= 2) { + error_setg(errp, "invalid queue-size property (%" PRIu16 "), " + "must be > 2", conf->queue_size); + return; + } if (!is_power_of_2(conf->queue_size) || conf->queue_size > VIRTQUEUE_MAX_SIZE) { error_setg(errp, "invalid queue-size property (%" PRIu16 "), " @@ -1267,6 +1273,7 @@ static Property virtio_blk_properties[] = { true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), + DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features, diff --git a/hw/core/machine.c b/hw/core/machine.c index f5e2b32b3b..ec2e3fcb61 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -29,6 +29,9 @@ GlobalProperty hw_compat_4_2[] = { { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, + { "virtio-blk-device", "seg-max-adjust", "off"}, + { "virtio-scsi-device", "seg_max_adjust", "off"}, + { "vhost-blk-device", "seg_max_adjust", "off"}, }; const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2); diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index c693fc748a..26f710d3ec 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -275,6 +275,8 @@ static Property vhost_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size, 128), + DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, 
conf.seg_max_adjust, + true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors, 0xFFFF), DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128), diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index f080545f48..4bc73a370e 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -659,7 +659,8 @@ static void virtio_scsi_get_config(VirtIODevice *vdev, VirtIOSCSICommon *s = VIRTIO_SCSI_COMMON(vdev); virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues); - virtio_stl_p(vdev, &scsiconf->seg_max, 128 - 2); + virtio_stl_p(vdev, &scsiconf->seg_max, + s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2); virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors); virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun); virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); @@ -898,6 +899,11 @@ void virtio_scsi_common_realize(DeviceState *dev, virtio_cleanup(vdev); return; } + if (s->conf.virtqueue_size <= 2) { + error_setg(errp, "invalid virtqueue_size property (= %" PRIu32 "), " + "must be > 2", s->conf.virtqueue_size); + return; + } s->cmd_vqs = g_new0(VirtQueue *, s->conf.num_queues); s->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; s->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE; @@ -954,6 +960,8 @@ static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, parent_obj.conf.virtqueue_size, 128), + DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, + parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, 0xFFFF), DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSI, parent_obj.conf.cmd_per_lun, diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 9c19f5b634..1e62f869b2 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -38,6 +38,7 @@ struct VirtIOBlkConf 
uint32_t request_merging; uint16_t num_queues; uint16_t queue_size; + bool seg_max_adjust; uint32_t max_discard_sectors; uint32_t max_write_zeroes_sectors; bool x_enable_wce_if_config_wce; diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h index 122f7c4b6f..24e768909d 100644 --- a/include/hw/virtio/virtio-scsi.h +++ b/include/hw/virtio/virtio-scsi.h @@ -48,6 +48,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig; struct VirtIOSCSIConf { uint32_t num_queues; uint32_t virtqueue_size; + bool seg_max_adjust; uint32_t max_sectors; uint32_t cmd_per_lun; #ifdef CONFIG_VHOST_SCSI From cec0242ab49d179c811d58b8c9c13d4e22e0e9c8 Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Fri, 20 Dec 2019 17:09:05 +0300 Subject: [PATCH 27/32] tests: add virtio-scsi and virtio-blk seg_max_adjust test It tests proper seg_max_adjust settings for all machine types except 'none', 'isapc', 'microvm' Signed-off-by: Denis Plotnikov Message-Id: <20191220140905.1718-3-dplotnikov@virtuozzo.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/acceptance/virtio_seg_max_adjust.py | 134 ++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100755 tests/acceptance/virtio_seg_max_adjust.py diff --git a/tests/acceptance/virtio_seg_max_adjust.py b/tests/acceptance/virtio_seg_max_adjust.py new file mode 100755 index 0000000000..5458573138 --- /dev/null +++ b/tests/acceptance/virtio_seg_max_adjust.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Test virtio-scsi and virtio-blk queue settings for all machine types +# +# Copyright (c) 2019 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import sys +import os +import re + +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python')) +from qemu.machine import QEMUMachine +from avocado_qemu import Test + +#list of machine types and virtqueue properties to test +VIRTIO_SCSI_PROPS = {'seg_max_adjust': 'seg_max_adjust'} +VIRTIO_BLK_PROPS = {'seg_max_adjust': 'seg-max-adjust'} + +DEV_TYPES = {'virtio-scsi-pci': VIRTIO_SCSI_PROPS, + 'virtio-blk-pci': VIRTIO_BLK_PROPS} + +VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'], + 'virtio-blk-pci': ['-device', + 'virtio-blk-pci,id=scsi0,drive=drive0', + '-drive', + 'driver=null-co,id=drive0,if=none']} + + +class VirtioMaxSegSettingsCheck(Test): + @staticmethod + def make_pattern(props): + pattern_items = ['{0} = \w+'.format(prop) for prop in props] + return '|'.join(pattern_items) + + def query_virtqueue(self, vm, dev_type_name): + query_ok = False + error = None + props = None + + output = vm.command('human-monitor-command', + command_line = 'info qtree') + props_list = DEV_TYPES[dev_type_name].values(); + pattern = self.make_pattern(props_list) + res = re.findall(pattern, output) + + if len(res) != len(props_list): + props_list = set(props_list) + res = set(res) + not_found = props_list.difference(res) + not_found = ', '.join(not_found) + error = '({0}): The following properties not found: {1}'\ + .format(dev_type_name, not_found) + else: + query_ok = True + props = dict() + for prop in res: + p = prop.split(' = ') + props[p[0]] = p[1] + return query_ok, props, error + + def check_mt(self, mt, dev_type_name): + with QEMUMachine(self.qemu_bin) as vm: + 
+ vm.set_machine(mt["name"]) + for s in VM_DEV_PARAMS[dev_type_name]: + vm.add_args(s) + vm.launch() + query_ok, props, error = self.query_virtqueue(vm, dev_type_name) + + if not query_ok: + self.fail('machine type {0}: {1}'.format(mt['name'], error)) + + for prop_name, prop_val in props.items(): + expected_val = mt[prop_name] + self.assertEqual(expected_val, prop_val) + + @staticmethod + def seg_max_adjust_enabled(mt): + # machine types >= 5.0 should have seg_max_adjust = true + # others seg_max_adjust = false + mt = mt.split("-") + + # machine types with one line name and name like pc-x.x + if len(mt) <= 2: + return False + + # machine types like pc--x.x[.x] + ver = mt[2] + ver = ver.split("."); + + # versions >= 5.0 goes with seg_max_adjust enabled + major = int(ver[0]) + + if major >= 5: + return True + return False + + def test_machine_types(self): + # collect all machine types except 'none', 'isapc', 'microvm' + with QEMUMachine(self.qemu_bin) as vm: + vm.launch() + machines = [m['name'] for m in vm.command('query-machines')] + vm.shutdown() + machines.remove('none') + machines.remove('isapc') + machines.remove('microvm') + + for dev_type in DEV_TYPES: + # create the list of machine types and their parameters. + mtypes = list() + for m in machines: + if self.seg_max_adjust_enabled(m): + enabled = 'true' + else: + enabled = 'false' + mtypes.append({'name': m, + DEV_TYPES[dev_type]['seg_max_adjust']: enabled}) + + # test each machine type for a device type + for mt in mtypes: + self.check_mt(mt, dev_type) From 1049f4c62c4070618cc5defc9963c6a17ae7a5ae Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Tue, 24 Dec 2019 11:14:46 +0300 Subject: [PATCH 28/32] virtio-mmio: update queue size on guest write Some guests read back the queue size after writing it. Always update the size on write; otherwise they might be confused. Cc: qemu-stable@nongnu.org Signed-off-by: Denis Plotnikov Message-Id: <20191224081446.17003-1-dplotnikov@virtuozzo.com> Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-mmio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index ef40b7a9b2..872f2cd237 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -308,8 +308,9 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, break; case VIRTIO_MMIO_QUEUE_NUM: trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE); + virtio_queue_set_num(vdev, vdev->queue_sel, value); + if (proxy->legacy) { - virtio_queue_set_num(vdev, vdev->queue_sel, value); virtio_queue_update_rings(vdev, vdev->queue_sel); } else { proxy->vqs[vdev->queue_sel].num = value; From 421afd2fe8dd4603216cbf36081877c391f5a2a4 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 26 Dec 2019 06:36:48 +0200 Subject: [PATCH 29/32] virtio: reset region cache on queue deletion https://bugzilla.redhat.com/show_bug.cgi?id=1708480 Fix a leak of a region reference that prevents complete device deletion on hot unplug. Cc: qemu-stable@nongnu.org Signed-off-by: Yuri Benditovich Message-Id: <20191226043649.14481-2-yuri.benditovich@daynix.com> Reviewed-by: Michael S. 
Tsirkin --- hw/virtio/virtio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 95d8ff8508..7b861e0ca0 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2344,6 +2344,7 @@ void virtio_delete_queue(VirtQueue *vq) vq->handle_aio_output = NULL; g_free(vq->used_elems); vq->used_elems = NULL; + virtio_virtqueue_reset_region_cache(vq); } void virtio_del_queue(VirtIODevice *vdev, int n) From d945d9f1731244ef341f74ede93120fc9de35913 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 26 Dec 2019 06:36:49 +0200 Subject: [PATCH 30/32] virtio-net: delete also control queue when TX/RX deleted https://bugzilla.redhat.com/show_bug.cgi?id=1708480 If the control queue is not deleted together with TX/RX, it will later be skipped when freeing cache resources, and hot unplug will not complete. Cc: qemu-stable@nongnu.org Signed-off-by: Yuri Benditovich Message-Id: <20191226043649.14481-3-yuri.benditovich@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/virtio-net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index db3d7c38e6..f325440d01 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3101,7 +3101,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) for (i = 0; i < max_queues; i++) { virtio_net_del_queue(n, i); } - + /* delete also control vq */ + virtio_del_queue(vdev, max_queues * 2); qemu_announce_timer_del(&n->announce_timer, false); g_free(n->vqs); qemu_del_nic(n->nic); From a2e1cd41ccfe796529abfd1b6aeb1dd4393762a2 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 3 Jan 2020 21:28:05 +0800 Subject: [PATCH 31/32] intel_iommu: a fix to vtd_find_as_from_bus_num() Ensure that vtd_find_as_from_bus_num() returns NULL when no matching bus is found, by explicitly resetting vtd_bus to NULL. This helps callers of vtd_find_as_from_bus_num() decide whether any further operation on the returned vtd_bus is appropriate. 
Cc: qemu-stable@nongnu.org Cc: Kevin Tian Cc: Jacob Pan Cc: Peter Xu Cc: Yi Sun Signed-off-by: Liu Yi L Signed-off-by: Yi Sun Message-Id: <1578058086-4288-2-git-send-email-yi.l.liu@intel.com> Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index ee06993675..609b80750a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -948,6 +948,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) return vtd_bus; } } + vtd_bus = NULL; } return vtd_bus; } From 56fc1e6ac6bde95bc0369d358587f2234d4dddad Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 3 Jan 2020 21:28:06 +0800 Subject: [PATCH 32/32] intel_iommu: add present bit check for pasid table entries The present bit checks for the pasid entry (pe) and the pasid directory entry (pdire) were missed in previous commits, because the fpd bit check does not require the present bit to be set. This patch adds the present bit check for callers that want to get a valid pe/pdire. Cc: qemu-stable@nongnu.org Cc: Kevin Tian Cc: Jacob Pan Cc: Peter Xu Cc: Yi Sun Reviewed-by: Peter Xu Signed-off-by: Liu Yi L Message-Id: <1578058086-4288-3-git-send-email-yi.l.liu@intel.com> Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 92 +++++++++++++++++++++++++++------- hw/i386/intel_iommu_internal.h | 1 + 2 files changed, 74 insertions(+), 19 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 609b80750a..a523ef0e65 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -686,9 +686,18 @@ static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, return true; } -static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, - uint32_t pasid, - VTDPASIDDirEntry *pdire) +static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) +{ + return pdire->val & 1; +} + +/** + * Caller of this function should check present bit if wants + * to use pdir entry for futher usage except for fpd bit check. + */ +static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDDirEntry *pdire) { uint32_t index; dma_addr_t addr, entry_size; @@ -703,18 +712,22 @@ static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, return 0; } -static int vtd_get_pasid_entry(IntelIOMMUState *s, - uint32_t pasid, - VTDPASIDDirEntry *pdire, - VTDPASIDEntry *pe) +static inline bool vtd_pe_present(VTDPASIDEntry *pe) +{ + return pe->val[0] & VTD_PASID_ENTRY_P; +} + +static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, + uint32_t pasid, + dma_addr_t addr, + VTDPASIDEntry *pe) { uint32_t index; - dma_addr_t addr, entry_size; + dma_addr_t entry_size; X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); index = VTD_PASID_TABLE_INDEX(pasid); entry_size = VTD_PASID_ENTRY_SIZE; - addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; addr = addr + index * entry_size; if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { return -VTD_FR_PASID_TABLE_INV; @@ -732,25 +745,54 @@ static int vtd_get_pasid_entry(IntelIOMMUState *s, return 0; } -static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s, - dma_addr_t pasid_dir_base, - uint32_t pasid, - VTDPASIDEntry *pe) +/** + * Caller of this function should check present bit 
if wants + * to use pasid entry for futher usage except for fpd bit check. + */ +static int vtd_get_pe_from_pdire(IntelIOMMUState *s, + uint32_t pasid, + VTDPASIDDirEntry *pdire, + VTDPASIDEntry *pe) +{ + dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; + + return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); +} + +/** + * This function gets a pasid entry from a specified pasid + * table (includes dir and leaf table) with a specified pasid. + * Sanity check should be done to ensure return a present + * pasid entry to caller. + */ +static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, + dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDEntry *pe) { int ret; VTDPASIDDirEntry pdire; - ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, + pasid, &pdire); if (ret) { return ret; } - ret = vtd_get_pasid_entry(s, pasid, &pdire, pe); + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); if (ret) { return ret; } - return ret; + if (!vtd_pe_present(pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; } static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, @@ -763,7 +805,7 @@ static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, pasid = VTD_CE_GET_RID2PASID(ce); pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); - ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe); + ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); return ret; } @@ -781,7 +823,11 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, pasid = VTD_CE_GET_RID2PASID(ce); pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); - ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. 
+ */ + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); if (ret) { return ret; } @@ -791,7 +837,15 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, return 0; } - ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe); + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. + */ + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); if (ret) { return ret; } diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index edcf9fc9bb..862033ebe6 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -479,6 +479,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ /* PASID Granular Translation Type Mask */ +#define VTD_PASID_ENTRY_P 1ULL #define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6) #define VTD_SM_PASID_ENTRY_FLT (1ULL << 6) #define VTD_SM_PASID_ENTRY_SLT (2ULL << 6)