From a7b4384fefbd1cbe05c8fd10fe06de8c4fdc2792 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Jan 2020 02:20:36 -0500 Subject: [PATCH 01/30] bios-tables-test: tell people how to update For now just a pointer to the source file. Signed-off-by: Michael S. Tsirkin Reviewed-by: Laurent Vivier Reviewed-by: Thomas Huth --- tests/qtest/bios-tables-test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index b4752c644c..9c3c4680e4 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -426,7 +426,9 @@ static void test_acpi_asl(test_data *data) fprintf(stderr, "acpi-test: Warning! %.4s binary file mismatch. " - "Actual [aml:%s], Expected [aml:%s].\n", + "Actual [aml:%s], Expected [aml:%s].\n" + "See source file tests/qtest/bios-tables-test.c " + "for instructions on how to update expected files.\n", exp_sdt->aml, sdt->aml_file, exp_sdt->aml_file); all_tables_match = all_tables_match && From c01e905f3a34088346833f3a5518cb0efeb24e27 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Jan 2020 02:21:18 -0500 Subject: [PATCH 02/30] bios-tables-test: fix up DIFF generation Turns out it goes to stdout which is suppressed even with V=1. Force DIFF output to stderr to make it visible. Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index 9c3c4680e4..1aed0ee7df 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -463,13 +463,19 @@ static void test_acpi_asl(test_data *data) "Actual [asl:%s, aml:%s], Expected [asl:%s, aml:%s].\n", exp_sdt->aml, sdt->asl_file, sdt->aml_file, exp_sdt->asl_file, exp_sdt->aml_file); + fflush(stderr); if (getenv("V")) { const char *diff_cmd = getenv("DIFF"); if (diff_cmd) { - int ret G_GNUC_UNUSED; char *diff = g_strdup_printf("%s %s %s", diff_cmd, exp_sdt->asl_file, sdt->asl_file); + int out = dup(STDOUT_FILENO); + int ret G_GNUC_UNUSED; + + dup2(STDERR_FILENO, STDOUT_FILENO); ret = system(diff) ; + dup2(out, STDOUT_FILENO); + close(out); g_free(diff); } else { fprintf(stderr, "acpi-test: Warning. not showing " From 34b1429ca92eae5bb2a532d5596f6150ea17150a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Jan 2020 02:24:54 -0500 Subject: [PATCH 03/30] bios-tables-test: default diff command Most people probably just want diff -u. So let's use that as the default. Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index 1aed0ee7df..0a597bbacf 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -465,25 +465,18 @@ static void test_acpi_asl(test_data *data) exp_sdt->asl_file, exp_sdt->aml_file); fflush(stderr); if (getenv("V")) { - const char *diff_cmd = getenv("DIFF"); - if (diff_cmd) { - char *diff = g_strdup_printf("%s %s %s", diff_cmd, - exp_sdt->asl_file, sdt->asl_file); - int out = dup(STDOUT_FILENO); - int ret G_GNUC_UNUSED; + const char *diff_env = getenv("DIFF"); + const char *diff_cmd = diff_env ? diff_env : "diff -u"; + char *diff = g_strdup_printf("%s %s %s", diff_cmd, + exp_sdt->asl_file, sdt->asl_file); + int out = dup(STDOUT_FILENO); + int ret G_GNUC_UNUSED; - dup2(STDERR_FILENO, STDOUT_FILENO); - ret = system(diff) ; - dup2(out, STDOUT_FILENO); - close(out); - g_free(diff); - } else { - fprintf(stderr, "acpi-test: Warning. not showing " - "difference since no diff utility is specified. " - "Set 'DIFF' environment variable to a preferred " - "diff utility and run 'make V=1 check' again to " - "see ASL difference."); - } + dup2(STDERR_FILENO, STDOUT_FILENO); + ret = system(diff) ; + dup2(out, STDOUT_FILENO); + close(out); + g_free(diff); } } } From 6ab425d8a10226098c29971f74b5fa954d5498d6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Jan 2020 03:02:43 -0500 Subject: [PATCH 04/30] rebuild-expected-aml.sh: remind about the process Remind users of rebuild-expected-aml.sh about the process to follow. Suppress the warning if allowed file list exists - that's a big hint user is already aware of the process. Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/rebuild-expected-aml.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/data/acpi/rebuild-expected-aml.sh b/tests/data/acpi/rebuild-expected-aml.sh index d44e511533..9cbaab1a4d 100755 --- a/tests/data/acpi/rebuild-expected-aml.sh +++ b/tests/data/acpi/rebuild-expected-aml.sh @@ -31,6 +31,13 @@ done eval `grep SRC_PATH= config-host.mak` +old_allowed_dif=`grep -v -e 'List of comma-separated changed AML files to ignore' ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h` + echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h echo "The files were rebuilt and can be added to git." + +if [ -z "$old_allowed_dif" ]; then + echo "Note! Please do not commit expected files with source changes" + echo "Note! Please follow the process documented in ${SRC_PATH}/tests/qtest/bios-tables-test.c" +fi From ba07cf5d3f58be697700b67d24e6277a7822b629 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Tue, 25 Feb 2020 15:55:51 +0800 Subject: [PATCH 05/30] vhost-user-fs: do delete virtio_queues in unrealize Similar to other virtio device(https://patchwork.kernel.org/patch/11399237/), virtio queues forgot to delete in unrealize, and aslo error path in realize, this patch fix these memleaks, the leak stack is as follow: Direct leak of 57344 byte(s) in 1 object(s) allocated from: #0 0x7f15784fb970 in __interceptor_calloc (/lib64/libasan.so.5+0xef970) #1 0x7f157790849d in g_malloc0 (/lib64/libglib-2.0.so.0+0x5249d) #2 0x55587a1bf859 in virtio_add_queue /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio.c:2333 #3 0x55587a2071d5 in vuf_device_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/vhost-user-fs.c:212 #4 0x55587a1ae360 in virtio_device_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio.c:3531 #5 0x55587a63fb7b in device_set_realized /mnt/sdb/qemu-new/qemu_test/qemu/hw/core/qdev.c:891 #6 0x55587acf03f5 in property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:2238 #7 0x55587acfce0d in object_property_set_qobject /mnt/sdb/qemu-new/qemu_test/qemu/qom/qom-qobject.c:26 #8 0x55587acf5c8c in object_property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:1390 #9 0x55587a8e22a2 in pci_qdev_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/pci/pci.c:2095 #10 0x55587a63fb7b in device_set_realized /mnt/sdb/qemu-new/qemu_test/qemu/hw/core/qdev.c:891 #11 0x55587acf03f5 in property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:2238 #12 0x55587acfce0d in object_property_set_qobject /mnt/sdb/qemu-new/qemu_test/qemu/qom/qom-qobject.c:26 #13 0x55587acf5c8c in object_property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:1390 #14 0x55587a496d65 in qdev_device_add /mnt/sdb/qemu-new/qemu_test/qemu/qdev-monitor.c:679 Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Cc: "Dr. David Alan Gilbert" Cc: Stefan Hajnoczi Message-Id: <20200225075554.10835-2-pannengyuan@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user-fs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index 33b17848c2..4554d123b7 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -230,6 +230,10 @@ static void vuf_device_realize(DeviceState *dev, Error **errp) err_virtio: vhost_user_cleanup(&fs->vhost_user); + virtio_del_queue(vdev, 0); + for (i = 0; i < fs->conf.num_request_queues; i++) { + virtio_del_queue(vdev, i + 1); + } virtio_cleanup(vdev); g_free(fs->vhost_dev.vqs); return; @@ -239,6 +243,7 @@ static void vuf_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VHostUserFS *fs = VHOST_USER_FS(dev); + int i; /* This will stop vhost backend if appropriate. */ vuf_set_status(vdev, 0); @@ -247,6 +252,10 @@ static void vuf_device_unrealize(DeviceState *dev, Error **errp) vhost_user_cleanup(&fs->vhost_user); + virtio_del_queue(vdev, 0); + for (i = 0; i < fs->conf.num_request_queues; i++) { + virtio_del_queue(vdev, i + 1); + } virtio_cleanup(vdev); g_free(fs->vhost_dev.vqs); fs->vhost_dev.vqs = NULL; From 2e5bc65935ddf8ee152c1317a3d3bb4bb73e727d Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Tue, 25 Feb 2020 15:55:52 +0800 Subject: [PATCH 06/30] vhost-user-fs: convert to the new virtio_delete_queue function use the new virtio_delete_queue function to cleanup. Signed-off-by: Pan Nengyuan Cc: "Dr. David Alan Gilbert" Cc: Stefan Hajnoczi Message-Id: <20200225075554.10835-3-pannengyuan@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user-fs.c | 15 +++++++++------ include/hw/virtio/vhost-user-fs.h | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index 4554d123b7..6136768875 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -209,11 +209,12 @@ static void vuf_device_realize(DeviceState *dev, Error **errp) sizeof(struct virtio_fs_config)); /* Hiprio queue */ - virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); + fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); /* Request queues */ + fs->req_vqs = g_new(VirtQueue *, fs->conf.num_request_queues); for (i = 0; i < fs->conf.num_request_queues; i++) { - virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); + fs->req_vqs[i] = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); } /* 1 high prio queue, plus the number configured */ @@ -230,10 +231,11 @@ static void vuf_device_realize(DeviceState *dev, Error **errp) err_virtio: vhost_user_cleanup(&fs->vhost_user); - virtio_del_queue(vdev, 0); + virtio_delete_queue(fs->hiprio_vq); for (i = 0; i < fs->conf.num_request_queues; i++) { - virtio_del_queue(vdev, i + 1); + virtio_delete_queue(fs->req_vqs[i]); } + g_free(fs->req_vqs); virtio_cleanup(vdev); g_free(fs->vhost_dev.vqs); return; @@ -252,10 +254,11 @@ static void vuf_device_unrealize(DeviceState *dev, Error **errp) vhost_user_cleanup(&fs->vhost_user); - virtio_del_queue(vdev, 0); + virtio_delete_queue(fs->hiprio_vq); for (i = 0; i < fs->conf.num_request_queues; i++) { - virtio_del_queue(vdev, i + 1); + virtio_delete_queue(fs->req_vqs[i]); } + g_free(fs->req_vqs); virtio_cleanup(vdev); g_free(fs->vhost_dev.vqs); fs->vhost_dev.vqs = NULL; diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h index 9ff1bdb7cf..6f3030d288 100644 --- a/include/hw/virtio/vhost-user-fs.h +++ b/include/hw/virtio/vhost-user-fs.h @@ -37,6 +37,8 @@ typedef struct { struct vhost_virtqueue *vhost_vqs; struct vhost_dev vhost_dev; VhostUserState vhost_user; + VirtQueue **req_vqs; + VirtQueue *hiprio_vq; /*< public >*/ } VHostUserFS; From 9861546e1dae05c5152de7d3bd14e341ecadc972 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Tue, 25 Feb 2020 15:55:53 +0800 Subject: [PATCH 07/30] virtio-pmem: do delete rq_vq in virtio_pmem_unrealize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to other virtio-devices, rq_vq forgot to delete in virtio_pmem_unrealize, this patch fix it. This device has already maintained a vq pointer, thus we use the new virtio_delete_queue function directly to do the cleanup. Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Message-Id: <20200225075554.10835-4-pannengyuan@huawei.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-pmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index 97287e923b..43399522f5 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -130,6 +130,7 @@ static void virtio_pmem_unrealize(DeviceState *dev, Error **errp) VirtIOPMEM *pmem = VIRTIO_PMEM(dev); host_memory_backend_set_mapped(pmem->memdev, false); + virtio_delete_queue(pmem->rq_vq); virtio_cleanup(vdev); } From d56e1c8256cb37e68f9b5d98c6cc4e6ca463f1fd Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Tue, 25 Feb 2020 15:55:54 +0800 Subject: [PATCH 08/30] virtio-crypto: do delete ctrl_vq in virtio_crypto_device_unrealize Similar to other virtio-deivces, ctrl_vq forgot to delete in virtio_crypto_device_unrealize, this patch fix it. This device has aleardy maintained vq pointers. Thus, we use the new virtio_delete_queue function directly to do the cleanup. The leak stack: Direct leak of 10752 byte(s) in 3 object(s) allocated from: #0 0x7f4c024b1970 in __interceptor_calloc (/lib64/libasan.so.5+0xef970) #1 0x7f4c018be49d in g_malloc0 (/lib64/libglib-2.0.so.0+0x5249d) #2 0x55a2f8017279 in virtio_add_queue /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio.c:2333 #3 0x55a2f8057035 in virtio_crypto_device_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio-crypto.c:814 #4 0x55a2f8005d80 in virtio_device_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio.c:3531 #5 0x55a2f8497d1b in device_set_realized /mnt/sdb/qemu-new/qemu_test/qemu/hw/core/qdev.c:891 #6 0x55a2f8b48595 in property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:2238 #7 0x55a2f8b54fad in object_property_set_qobject /mnt/sdb/qemu-new/qemu_test/qemu/qom/qom-qobject.c:26 #8 0x55a2f8b4de2c in object_property_set_bool /mnt/sdb/qemu-new/qemu_test/qemu/qom/object.c:1390 #9 0x55a2f80609c9 in virtio_crypto_pci_realize /mnt/sdb/qemu-new/qemu_test/qemu/hw/virtio/virtio-crypto-pci.c:58 Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Cc: "Gonglei (Arei)" Message-Id: <20200225075554.10835-5-pannengyuan@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-crypto.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index 7351ab0a19..4c65114de5 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -831,12 +831,13 @@ static void virtio_crypto_device_unrealize(DeviceState *dev, Error **errp) max_queues = vcrypto->multiqueue ? vcrypto->max_queues : 1; for (i = 0; i < max_queues; i++) { - virtio_del_queue(vdev, i); + virtio_delete_queue(vcrypto->vqs[i].dataq); q = &vcrypto->vqs[i]; qemu_bh_delete(q->dataq_bh); } g_free(vcrypto->vqs); + virtio_delete_queue(vcrypto->ctrl_vq); virtio_cleanup(vdev); cryptodev_backend_set_used(vcrypto->cryptodev, false); From 13e5468127111bf44c5dc314d1dd2ec5a65dfec4 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Mon, 24 Feb 2020 12:13:35 +0800 Subject: [PATCH 09/30] vhost-user-blk: delete virtioqueues in unrealize to fix memleaks virtio queues forgot to delete in unrealize, and aslo error path in realize, this patch fix these memleaks, the leak stack is as follow: Direct leak of 114688 byte(s) in 16 object(s) allocated from: #0 0x7f24024fdbf0 in calloc (/lib64/libasan.so.3+0xcabf0) #1 0x7f2401642015 in g_malloc0 (/lib64/libglib-2.0.so.0+0x50015) #2 0x55ad175a6447 in virtio_add_queue /mnt/sdb/qemu/hw/virtio/virtio.c:2327 #3 0x55ad17570cf9 in vhost_user_blk_device_realize /mnt/sdb/qemu/hw/block/vhost-user-blk.c:419 #4 0x55ad175a3707 in virtio_device_realize /mnt/sdb/qemu/hw/virtio/virtio.c:3509 #5 0x55ad176ad0d1 in device_set_realized /mnt/sdb/qemu/hw/core/qdev.c:876 #6 0x55ad1781ff9d in property_set_bool /mnt/sdb/qemu/qom/object.c:2080 #7 0x55ad178245ae in object_property_set_qobject /mnt/sdb/qemu/qom/qom-qobject.c:26 #8 0x55ad17821eb4 in object_property_set_bool /mnt/sdb/qemu/qom/object.c:1338 #9 0x55ad177aeed7 in virtio_pci_realize /mnt/sdb/qemu/hw/virtio/virtio-pci.c:1801 Reported-by: Euler Robot Signed-off-by: Pan Nengyuan Reviewed-by: Stefan Hajnoczi Message-Id: <20200224041336.30790-2-pannengyuan@huawei.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/block/vhost-user-blk.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index d8c459c575..2eba8b9db0 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -460,6 +460,9 @@ reconnect: virtio_err: g_free(s->vqs); g_free(s->inflight); + for (i = 0; i < s->num_queues; i++) { + virtio_del_queue(vdev, i); + } virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); } @@ -468,6 +471,7 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VHostUserBlk *s = VHOST_USER_BLK(dev); + int i; virtio_set_status(vdev, 0); qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, @@ -476,6 +480,10 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp) vhost_dev_free_inflight(s->inflight); g_free(s->vqs); g_free(s->inflight); + + for (i = 0; i < s->num_queues; i++) { + virtio_del_queue(vdev, i); + } virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); } From 38e245a42ca81c47d477314c3cfabfcb3d3cec11 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Mon, 24 Feb 2020 12:13:36 +0800 Subject: [PATCH 10/30] vhost-user-blk: convert to new virtio_delete_queue use the new virtio_delete_queue function to cleanup. Signed-off-by: Pan Nengyuan Message-Id: <20200224041336.30790-3-pannengyuan@huawei.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/block/vhost-user-blk.c | 19 +++++++++++-------- include/hw/virtio/vhost-user-blk.h | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 2eba8b9db0..12925a47ec 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -306,7 +306,7 @@ static int vhost_user_blk_connect(DeviceState *dev) s->connected = true; s->dev.nvqs = s->num_queues; - s->dev.vqs = s->vqs; + s->dev.vqs = s->vhost_vqs; s->dev.vq_index = 0; s->dev.backend_features = 0; @@ -420,13 +420,14 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config)); + s->virtqs = g_new(VirtQueue *, s->num_queues); for (i = 0; i < s->num_queues; i++) { - virtio_add_queue(vdev, s->queue_size, - vhost_user_blk_handle_output); + s->virtqs[i] = virtio_add_queue(vdev, s->queue_size, + vhost_user_blk_handle_output); } s->inflight = g_new0(struct vhost_inflight, 1); - s->vqs = g_new0(struct vhost_virtqueue, s->num_queues); + s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues); s->watch = 0; s->connected = false; @@ -458,11 +459,12 @@ reconnect: return; virtio_err: - g_free(s->vqs); + g_free(s->vhost_vqs); g_free(s->inflight); for (i = 0; i < s->num_queues; i++) { - virtio_del_queue(vdev, i); + virtio_delete_queue(s->virtqs[i]); } + g_free(s->virtqs); virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); } @@ -478,12 +480,13 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp) NULL, NULL, NULL, false); vhost_dev_cleanup(&s->dev); vhost_dev_free_inflight(s->inflight); - g_free(s->vqs); + g_free(s->vhost_vqs); g_free(s->inflight); for (i = 0; i < s->num_queues; i++) { - virtio_del_queue(vdev, i); + virtio_delete_queue(s->virtqs[i]); } + g_free(s->virtqs); virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); } diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h index 108bfadeeb..05ea0ad183 100644 --- a/include/hw/virtio/vhost-user-blk.h +++ b/include/hw/virtio/vhost-user-blk.h @@ -36,7 +36,8 @@ typedef struct VHostUserBlk { struct vhost_dev dev; struct vhost_inflight *inflight; VhostUserState vhost_user; - struct vhost_virtqueue *vqs; + struct vhost_virtqueue *vhost_vqs; + VirtQueue **virtqs; guint watch; bool connected; } VHostUserBlk; From abdd16f4681cc4d6bf84990227b5c9b98e869ccd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 7 Feb 2020 10:46:19 +0000 Subject: [PATCH 11/30] virtio: gracefully handle invalid region caches The virtqueue code sets up MemoryRegionCaches to access the virtqueue guest RAM data structures. The code currently assumes that VRingMemoryRegionCaches is initialized before device emulation code accesses the virtqueue. An assertion will fail in vring_get_region_caches() when this is not true. Device fuzzing found a case where this assumption is false (see below). Virtqueue guest RAM addresses can also be changed from a vCPU thread while an IOThread is accessing the virtqueue. This breaks the same assumption but this time the caches could become invalid partway through the virtqueue code. The code fetches the caches RCU pointer multiple times so we will need to validate the pointer every time it is fetched. Add checks each time we call vring_get_region_caches() and treat invalid caches as a nop: memory stores are ignored and memory reads return 0. The fuzz test failure is as follows: $ qemu -M pc -device virtio-blk-pci,id=drv0,drive=drive0,addr=4.0 \ -drive if=none,id=drive0,file=null-co://,format=raw,auto-read-only=off \ -drive if=none,id=drive1,file=null-co://,file.read-zeroes=on,format=raw \ -display none \ -qtest stdio endianness outl 0xcf8 0x80002020 outl 0xcfc 0xe0000000 outl 0xcf8 0x80002004 outw 0xcfc 0x7 write 0xe0000000 0x24 0x00ffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffab5cffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffabffffffab0000000001 inb 0x4 writew 0xe000001c 0x1 write 0xe0000014 0x1 0x0d The following error message is produced: qemu-system-x86_64: /home/stefanha/qemu/hw/virtio/virtio.c:286: vring_get_region_caches: Assertion `caches != NULL' failed. The backtrace looks like this: #0 0x00007ffff5520625 in raise () at /lib64/libc.so.6 #1 0x00007ffff55098d9 in abort () at /lib64/libc.so.6 #2 0x00007ffff55097a9 in _nl_load_domain.cold () at /lib64/libc.so.6 #3 0x00007ffff5518a66 in annobin_assert.c_end () at /lib64/libc.so.6 #4 0x00005555559073da in vring_get_region_caches (vq=) at qemu/hw/virtio/virtio.c:286 #5 vring_get_region_caches (vq=) at qemu/hw/virtio/virtio.c:283 #6 0x000055555590818d in vring_used_flags_set_bit (mask=1, vq=0x5555575ceea0) at qemu/hw/virtio/virtio.c:398 #7 virtio_queue_split_set_notification (enable=0, vq=0x5555575ceea0) at qemu/hw/virtio/virtio.c:398 #8 virtio_queue_set_notification (vq=vq@entry=0x5555575ceea0, enable=enable@entry=0) at qemu/hw/virtio/virtio.c:451 #9 0x0000555555908512 in virtio_queue_set_notification (vq=vq@entry=0x5555575ceea0, enable=enable@entry=0) at qemu/hw/virtio/virtio.c:444 #10 0x00005555558c697a in virtio_blk_handle_vq (s=0x5555575c57e0, vq=0x5555575ceea0) at qemu/hw/block/virtio-blk.c:775 #11 0x0000555555907836 in virtio_queue_notify_aio_vq (vq=0x5555575ceea0) at qemu/hw/virtio/virtio.c:2244 #12 0x0000555555cb5dd7 in aio_dispatch_handlers (ctx=ctx@entry=0x55555671a420) at util/aio-posix.c:429 #13 0x0000555555cb67a8 in aio_dispatch (ctx=0x55555671a420) at util/aio-posix.c:460 #14 0x0000555555cb307e in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:260 #15 0x00007ffff7bbc510 in g_main_context_dispatch () at /lib64/libglib-2.0.so.0 #16 0x0000555555cb5848 in glib_pollfds_poll () at util/main-loop.c:219 #17 os_host_main_loop_wait (timeout=) at util/main-loop.c:242 #18 main_loop_wait (nonblocking=) at util/main-loop.c:518 #19 0x00005555559b20c9 in main_loop () at vl.c:1683 #20 0x0000555555838115 in main (argc=, argv=, envp=) at vl.c:4441 Reported-by: Alexander Bulekov Cc: Michael Tsirkin Cc: Cornelia Huck Cc: Paolo Bonzini Cc: qemu-stable@nongnu.org Signed-off-by: Stefan Hajnoczi Message-Id: <20200207104619.164892-1-stefanha@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 99 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 2c5410e981..00d444699d 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -282,15 +282,19 @@ static void vring_packed_flags_write(VirtIODevice *vdev, /* Called within rcu_read_lock(). */ static VRingMemoryRegionCaches *vring_get_region_caches(struct VirtQueue *vq) { - VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); - assert(caches != NULL); - return caches; + return atomic_rcu_read(&vq->vring.caches); } + /* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_flags(VirtQueue *vq) { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingAvail, flags); + + if (!caches) { + return 0; + } + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } @@ -299,6 +303,11 @@ static inline uint16_t vring_avail_idx(VirtQueue *vq) { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingAvail, idx); + + if (!caches) { + return 0; + } + vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); return vq->shadow_avail_idx; } @@ -308,6 +317,11 @@ static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingAvail, ring[i]); + + if (!caches) { + return 0; + } + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } @@ -323,6 +337,11 @@ static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingUsed, ring[i]); + + if (!caches) { + return; + } + virtio_tswap32s(vq->vdev, &uelem->id); virtio_tswap32s(vq->vdev, &uelem->len); address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem)); @@ -334,6 +353,11 @@ static uint16_t vring_used_idx(VirtQueue *vq) { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingUsed, idx); + + if (!caches) { + return 0; + } + return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); } @@ -342,8 +366,12 @@ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) { VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); hwaddr pa = offsetof(VRingUsed, idx); - virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); - address_space_cache_invalidate(&caches->used, pa, sizeof(val)); + + if (caches) { + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); + address_space_cache_invalidate(&caches->used, pa, sizeof(val)); + } + vq->used_idx = val; } @@ -353,8 +381,13 @@ static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); VirtIODevice *vdev = vq->vdev; hwaddr pa = offsetof(VRingUsed, flags); - uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + uint16_t flags; + if (!caches) { + return; + } + + flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask); address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } @@ -365,8 +398,13 @@ static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); VirtIODevice *vdev = vq->vdev; hwaddr pa = offsetof(VRingUsed, flags); - uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + uint16_t flags; + if (!caches) { + return; + } + + flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask); address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } @@ -381,6 +419,10 @@ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) } caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + pa = offsetof(VRingUsed, ring[vq->vring.num]); virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); address_space_cache_invalidate(&caches->used, pa, sizeof(val)); @@ -410,7 +452,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) VRingMemoryRegionCaches *caches; RCU_READ_LOCK_GUARD(); - caches = vring_get_region_caches(vq); + caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + vring_packed_event_read(vq->vdev, &caches->used, &e); if (!enable) { @@ -597,6 +643,10 @@ static int virtio_queue_packed_empty_rcu(VirtQueue *vq) } cache = vring_get_region_caches(vq); + if (!cache) { + return 1; + } + vring_packed_desc_read_flags(vq->vdev, &desc.flags, &cache->desc, vq->last_avail_idx); @@ -777,6 +827,10 @@ static void virtqueue_packed_fill_desc(VirtQueue *vq, } caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + vring_packed_desc_write(vq->vdev, &desc, &caches->desc, head, strict_order); } @@ -949,6 +1003,10 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq, max = vq->vring.num; caches = vring_get_region_caches(vq); + if (!caches) { + goto err; + } + while ((rc = virtqueue_num_heads(vq, idx)) > 0) { MemoryRegionCache *desc_cache = &caches->desc; unsigned int num_bufs; @@ -1089,6 +1147,9 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue *vq, max = vq->vring.num; caches = vring_get_region_caches(vq); + if (!caches) { + goto err; + } for (;;) { unsigned int num_bufs = total_bufs; @@ -1194,6 +1255,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } caches = vring_get_region_caches(vq); + if (!caches) { + goto err; + } + desc_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) ? sizeof(VRingPackedDesc) : sizeof(VRingDesc); if (caches->desc.len < vq->vring.num * desc_size) { @@ -1387,6 +1452,11 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) i = head; caches = vring_get_region_caches(vq); + if (!caches) { + virtio_error(vdev, "Region caches not initialized"); + goto done; + } + if (caches->desc.len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto done; @@ -1509,6 +1579,11 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) i = vq->last_avail_idx; caches = vring_get_region_caches(vq); + if (!caches) { + virtio_error(vdev, "Region caches not initialized"); + goto done; + } + if (caches->desc.len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto done; @@ -1628,6 +1703,10 @@ static unsigned int virtqueue_packed_drop_all(VirtQueue *vq) VRingPackedDesc desc; caches = vring_get_region_caches(vq); + if (!caches) { + return 0; + } + desc_cache = &caches->desc; virtio_queue_set_notification(vq, 0); @@ -2412,6 +2491,10 @@ static bool virtio_packed_should_notify(VirtIODevice *vdev, VirtQueue *vq) VRingMemoryRegionCaches *caches; caches = vring_get_region_caches(vq); + if (!caches) { + return false; + } + vring_packed_event_read(vdev, &caches->avail, &e); old = vq->signalled_used; From 22c37a10f3e1de1b19aa8281bc368d8cea960f4c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:36 +0100 Subject: [PATCH 12/30] virtio-iommu: Add skeleton This patchs adds the skeleton for the virtio-iommu device. Signed-off-by: Eric Auger Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-2-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Kconfig | 5 + hw/virtio/Makefile.objs | 1 + hw/virtio/trace-events | 7 + hw/virtio/virtio-iommu.c | 265 +++++++++++++++++++++++++++++++ include/hw/virtio/virtio-iommu.h | 57 +++++++ 5 files changed, 335 insertions(+) create mode 100644 hw/virtio/virtio-iommu.c create mode 100644 include/hw/virtio/virtio-iommu.h diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig index f87def27a6..d29525b36f 100644 --- a/hw/virtio/Kconfig +++ b/hw/virtio/Kconfig @@ -9,6 +9,11 @@ config VIRTIO_RNG default y depends on VIRTIO +config VIRTIO_IOMMU + bool + default y + depends on VIRTIO + config VIRTIO_PCI bool default y if PCI_DEVICES diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index de0f5fc39b..2fd9da7410 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -16,6 +16,7 @@ obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-p obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pmem-pci.o obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o ifeq ($(CONFIG_VIRTIO_PCI),y) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index e28ba48da6..02d93d7f63 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -53,3 +53,10 @@ virtio_mmio_write_offset(uint64_t offset, uint64_t value) "virtio_mmio_write off virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 " shift %d" virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" PRIx64 " max %d" virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d" + +# hw/virtio/virtio-iommu.c +virtio_iommu_device_reset(void) "reset!" +virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 +virtio_iommu_device_status(uint8_t status) "driver status = %d" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_range=%d probe_size=0x%x" +virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c new file mode 100644 index 0000000000..30579267d5 --- /dev/null +++ b/hw/virtio/virtio-iommu.c @@ -0,0 +1,265 @@ +/* + * virtio-iommu device + * + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qemu-common.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio.h" +#include "sysemu/kvm.h" +#include "trace.h" + +#include "standard-headers/linux/virtio_ids.h" + +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" +#include "hw/virtio/virtio-iommu.h" + +/* Max size */ +#define VIOMMU_DEFAULT_QUEUE_SIZE 256 + +static int virtio_iommu_handle_attach(VirtIOIOMMU *s, + struct iovec *iov, + unsigned int iov_cnt) +{ + return VIRTIO_IOMMU_S_UNSUPP; +} +static int virtio_iommu_handle_detach(VirtIOIOMMU *s, + struct iovec *iov, + unsigned int iov_cnt) +{ + return VIRTIO_IOMMU_S_UNSUPP; +} +static int virtio_iommu_handle_map(VirtIOIOMMU *s, + struct iovec *iov, + unsigned int iov_cnt) +{ + return VIRTIO_IOMMU_S_UNSUPP; +} +static int virtio_iommu_handle_unmap(VirtIOIOMMU *s, + struct iovec *iov, + unsigned int iov_cnt) +{ + return VIRTIO_IOMMU_S_UNSUPP; +} + +static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + struct virtio_iommu_req_head head; + struct virtio_iommu_req_tail tail = {}; + VirtQueueElement *elem; + unsigned int iov_cnt; + struct iovec *iov; + size_t sz; + + for (;;) { + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return; + } + + if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) || + iov_size(elem->out_sg, elem->out_num) < sizeof(head)) { + virtio_error(vdev, "virtio-iommu bad head/tail size"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + break; + } + + iov_cnt = elem->out_num; + iov = elem->out_sg; + sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head)); + if (unlikely(sz != sizeof(head))) { + tail.status = VIRTIO_IOMMU_S_DEVERR; + goto out; + } + qemu_mutex_lock(&s->mutex); + switch (head.type) { + case VIRTIO_IOMMU_T_ATTACH: + tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_DETACH: + tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_MAP: + tail.status = virtio_iommu_handle_map(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_UNMAP: + tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt); + break; + default: + tail.status = VIRTIO_IOMMU_S_UNSUPP; + } + qemu_mutex_unlock(&s->mutex); + +out: + sz = iov_from_buf(elem->in_sg, elem->in_num, 0, + &tail, sizeof(tail)); + assert(sz == sizeof(tail)); + + virtqueue_push(vq, elem, sizeof(tail)); + virtio_notify(vdev, vq); + g_free(elem); + } +} + +static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + struct virtio_iommu_config *config = &dev->config; + + trace_virtio_iommu_get_config(config->page_size_mask, + config->input_range.start, + config->input_range.end, + config->domain_range.end, + config->probe_size); + memcpy(config_data, &dev->config, sizeof(struct virtio_iommu_config)); +} + +static void virtio_iommu_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + struct virtio_iommu_config config; + + memcpy(&config, config_data, sizeof(struct virtio_iommu_config)); + trace_virtio_iommu_set_config(config.page_size_mask, + config.input_range.start, + config.input_range.end, + config.domain_range.end, + config.probe_size); +} + +static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f, + Error **errp) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + + f |= dev->features; + trace_virtio_iommu_get_features(f); + return f; +} + +/* + * Migration is not yet supported: most of the state consists + * of balanced binary trees which are not yet ready for getting + * migrated + */ +static const VMStateDescription vmstate_virtio_iommu_device = { + .name = "virtio-iommu-device", + .unmigratable = 1, +}; + +static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + + virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU, + sizeof(struct virtio_iommu_config)); + + s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, + virtio_iommu_handle_command); + s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL); + + s->config.page_size_mask = TARGET_PAGE_MASK; + s->config.input_range.end = -1UL; + s->config.domain_range.end = 32; + + virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX); + virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC); + virtio_add_feature(&s->features, VIRTIO_F_VERSION_1); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO); + + qemu_mutex_init(&s->mutex); +} + +static void virtio_iommu_device_unrealize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + + virtio_cleanup(vdev); +} + +static void virtio_iommu_device_reset(VirtIODevice *vdev) +{ + trace_virtio_iommu_device_reset(); +} + +static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status) +{ + trace_virtio_iommu_device_status(status); +} + +static void virtio_iommu_instance_init(Object *obj) +{ +} + +static const VMStateDescription vmstate_virtio_iommu = { + .name = "virtio-iommu", + .minimum_version_id = 1, + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static Property virtio_iommu_properties[] = { + DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, "PCI", PCIBus *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_iommu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_iommu_properties); + dc->vmsd = &vmstate_virtio_iommu; + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_iommu_device_realize; + vdc->unrealize = virtio_iommu_device_unrealize; + vdc->reset = virtio_iommu_device_reset; + vdc->get_config = virtio_iommu_get_config; + vdc->set_config = virtio_iommu_set_config; + vdc->get_features = virtio_iommu_get_features; + vdc->set_status = virtio_iommu_set_status; + vdc->vmsd = &vmstate_virtio_iommu_device; +} + +static const TypeInfo virtio_iommu_info = { + .name = TYPE_VIRTIO_IOMMU, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOIOMMU), + .instance_init = virtio_iommu_instance_init, + .class_init = virtio_iommu_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_iommu_info); +} + +type_init(virtio_register_types) diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h new file mode 100644 index 0000000000..d24ba63305 --- /dev/null +++ b/include/hw/virtio/virtio-iommu.h @@ -0,0 +1,57 @@ +/* + * virtio-iommu device + * + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + */ + +#ifndef QEMU_VIRTIO_IOMMU_H +#define QEMU_VIRTIO_IOMMU_H + +#include "standard-headers/linux/virtio_iommu.h" +#include "hw/virtio/virtio.h" +#include "hw/pci/pci.h" + +#define TYPE_VIRTIO_IOMMU "virtio-iommu-device" +#define VIRTIO_IOMMU(obj) \ + OBJECT_CHECK(VirtIOIOMMU, (obj), TYPE_VIRTIO_IOMMU) + +typedef struct IOMMUDevice { + void *viommu; + PCIBus *bus; + int devfn; + IOMMUMemoryRegion iommu_mr; + AddressSpace as; +} IOMMUDevice; + +typedef struct IOMMUPciBus { + PCIBus *bus; + IOMMUDevice *pbdev[0]; /* Parent array is sparse, so dynamically alloc */ +} IOMMUPciBus; + +typedef struct VirtIOIOMMU { + VirtIODevice parent_obj; + VirtQueue *req_vq; + VirtQueue *event_vq; + struct virtio_iommu_config config; + uint64_t features; + GHashTable *as_by_busptr; + PCIBus *primary_bus; + GTree *domains; + QemuMutex mutex; + GTree *endpoints; +} VirtIOIOMMU; + +#endif From 5442b854eaf921588e24d5711640ab71e59cb1b8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:37 +0100 Subject: [PATCH 13/30] virtio-iommu: Decode the command payload This patch adds the command payload decoding and introduces the functions that will do the actual command handling. Those functions are not yet implemented. Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-3-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 4 +++ hw/virtio/virtio-iommu.c | 76 +++++++++++++++++++++++++++++++++------- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 02d93d7f63..f7141aa2f6 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -60,3 +60,7 @@ virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx6 virtio_iommu_device_status(uint8_t status) "driver status = %d" virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_range=%d probe_size=0x%x" virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x" +virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" +virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 30579267d5..86dcdc09a1 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -34,31 +34,83 @@ /* Max size */ #define VIOMMU_DEFAULT_QUEUE_SIZE 256 -static int virtio_iommu_handle_attach(VirtIOIOMMU *s, - struct iovec *iov, - unsigned int iov_cnt) +static int virtio_iommu_attach(VirtIOIOMMU *s, + struct virtio_iommu_req_attach *req) { + uint32_t domain_id = le32_to_cpu(req->domain); + uint32_t ep_id = le32_to_cpu(req->endpoint); + + trace_virtio_iommu_attach(domain_id, ep_id); + return VIRTIO_IOMMU_S_UNSUPP; } -static int virtio_iommu_handle_detach(VirtIOIOMMU *s, - struct iovec *iov, - unsigned int iov_cnt) + +static int virtio_iommu_detach(VirtIOIOMMU *s, + struct virtio_iommu_req_detach *req) { + uint32_t domain_id = le32_to_cpu(req->domain); + uint32_t ep_id = le32_to_cpu(req->endpoint); + + trace_virtio_iommu_detach(domain_id, ep_id); + return VIRTIO_IOMMU_S_UNSUPP; } -static int virtio_iommu_handle_map(VirtIOIOMMU *s, - struct iovec *iov, - unsigned int iov_cnt) + +static int virtio_iommu_map(VirtIOIOMMU *s, + struct virtio_iommu_req_map *req) { + uint32_t domain_id = le32_to_cpu(req->domain); + uint64_t phys_start = le64_to_cpu(req->phys_start); + uint64_t virt_start = le64_to_cpu(req->virt_start); + uint64_t virt_end = le64_to_cpu(req->virt_end); + uint32_t flags = le32_to_cpu(req->flags); + + trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags); + return VIRTIO_IOMMU_S_UNSUPP; } -static int virtio_iommu_handle_unmap(VirtIOIOMMU *s, - struct iovec *iov, - unsigned int iov_cnt) + +static int virtio_iommu_unmap(VirtIOIOMMU *s, + struct virtio_iommu_req_unmap *req) { + uint32_t domain_id = le32_to_cpu(req->domain); + uint64_t virt_start = le64_to_cpu(req->virt_start); + uint64_t virt_end = le64_to_cpu(req->virt_end); + + trace_virtio_iommu_unmap(domain_id, virt_start, virt_end); + return VIRTIO_IOMMU_S_UNSUPP; } +static int virtio_iommu_iov_to_req(struct iovec *iov, + unsigned int iov_cnt, + void *req, size_t req_sz) +{ + size_t sz, payload_sz = req_sz - sizeof(struct virtio_iommu_req_tail); + + sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); + if (unlikely(sz != payload_sz)) { + return VIRTIO_IOMMU_S_INVAL; + } + return 0; +} + +#define virtio_iommu_handle_req(__req) \ +static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \ + struct iovec *iov, \ + unsigned int iov_cnt) \ +{ \ + struct virtio_iommu_req_ ## __req req; \ + int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); \ + \ + return ret ? ret : virtio_iommu_ ## __req(s, &req); \ +} + +virtio_iommu_handle_req(attach) +virtio_iommu_handle_req(detach) +virtio_iommu_handle_req(map) +virtio_iommu_handle_req(unmap) + static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq) { VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); From cfb42188b24d5904f82cd609185847a1168226e3 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:38 +0100 Subject: [PATCH 14/30] virtio-iommu: Implement attach/detach command This patch implements the endpoint attach/detach to/from a domain. Domain and endpoint internal datatypes are introduced. Both are stored in RB trees. The domain owns a list of endpoints attached to it. Also helpers to get/put end points and domains are introduced. As for the IOMMU memory regions, a callback is called on PCI bus enumeration that initializes for a given device on the bus hierarchy an IOMMU memory region. The PCI bus hierarchy is stored locally in IOMMUPciBus and IOMMUDevice objects. At the time of the enumeration, the bus number may not be computed yet. So operations that will need to retrieve the IOMMUdevice and its IOMMU memory region from the bus number and devfn, once the bus number is garanteed to be frozen, use an array of IOMMUPciBus, lazily populated. Signed-off-by: Eric Auger Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-4-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 6 + hw/virtio/virtio-iommu.c | 311 ++++++++++++++++++++++++++++++- include/hw/virtio/virtio-iommu.h | 3 + 3 files changed, 318 insertions(+), 2 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index f7141aa2f6..15595f8cd7 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -64,3 +64,9 @@ virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d" +virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s" +virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d" +virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d" +virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d" +virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 86dcdc09a1..d9fe83f530 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -23,6 +23,8 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" #include "sysemu/kvm.h" +#include "qapi/error.h" +#include "qemu/error-report.h" #include "trace.h" #include "standard-headers/linux/virtio_ids.h" @@ -30,19 +32,236 @@ #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" #include "hw/virtio/virtio-iommu.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci.h" /* Max size */ #define VIOMMU_DEFAULT_QUEUE_SIZE 256 +typedef struct VirtIOIOMMUDomain { + uint32_t id; + GTree *mappings; + QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list; +} VirtIOIOMMUDomain; + +typedef struct VirtIOIOMMUEndpoint { + uint32_t id; + VirtIOIOMMUDomain *domain; + QLIST_ENTRY(VirtIOIOMMUEndpoint) next; +} VirtIOIOMMUEndpoint; + +typedef struct VirtIOIOMMUInterval { + uint64_t low; + uint64_t high; +} VirtIOIOMMUInterval; + +static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) +{ + return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); +} + +/** + * The bus number is used for lookup when SID based operations occur. + * In that case we lazily populate the IOMMUPciBus array from the bus hash + * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus + * numbers may not be always initialized yet. + */ +static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num) +{ + IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num]; + + if (!iommu_pci_bus) { + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) { + if (pci_bus_num(iommu_pci_bus->bus) == bus_num) { + s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus; + return iommu_pci_bus; + } + } + return NULL; + } + return iommu_pci_bus; +} + +static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid) +{ + uint8_t bus_n, devfn; + IOMMUPciBus *iommu_pci_bus; + IOMMUDevice *dev; + + bus_n = PCI_BUS_NUM(sid); + iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n); + if (iommu_pci_bus) { + devfn = sid & PCI_DEVFN_MAX; + dev = iommu_pci_bus->pbdev[devfn]; + if (dev) { + return &dev->iommu_mr; + } + } + return NULL; +} + +static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data) +{ + VirtIOIOMMUInterval *inta = (VirtIOIOMMUInterval *)a; + VirtIOIOMMUInterval *intb = (VirtIOIOMMUInterval *)b; + + if (inta->high < intb->low) { + return -1; + } else if (intb->high < inta->low) { + return 1; + } else { + return 0; + } +} + +static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) +{ + if (!ep->domain) { + return; + } + QLIST_REMOVE(ep, next); + ep->domain = NULL; +} + +static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, + uint32_t ep_id) +{ + VirtIOIOMMUEndpoint *ep; + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id)); + if (ep) { + return ep; + } + if (!virtio_iommu_mr(s, ep_id)) { + return NULL; + } + ep = g_malloc0(sizeof(*ep)); + ep->id = ep_id; + trace_virtio_iommu_get_endpoint(ep_id); + g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep); + return ep; +} + +static void virtio_iommu_put_endpoint(gpointer data) +{ + VirtIOIOMMUEndpoint *ep = (VirtIOIOMMUEndpoint *)data; + + if (ep->domain) { + virtio_iommu_detach_endpoint_from_domain(ep); + } + + trace_virtio_iommu_put_endpoint(ep->id); + g_free(ep); +} + +static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s, + uint32_t domain_id) +{ + VirtIOIOMMUDomain *domain; + + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (domain) { + return domain; + } + domain = g_malloc0(sizeof(*domain)); + domain->id = domain_id; + domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp, + NULL, (GDestroyNotify)g_free, + (GDestroyNotify)g_free); + g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain); + QLIST_INIT(&domain->endpoint_list); + trace_virtio_iommu_get_domain(domain_id); + return domain; +} + +static void virtio_iommu_put_domain(gpointer data) +{ + VirtIOIOMMUDomain *domain = (VirtIOIOMMUDomain *)data; + VirtIOIOMMUEndpoint *iter, *tmp; + + QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) { + virtio_iommu_detach_endpoint_from_domain(iter); + } + g_tree_destroy(domain->mappings); + trace_virtio_iommu_put_domain(domain->id); + g_free(domain); +} + +static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, + int devfn) +{ + VirtIOIOMMU *s = opaque; + IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); + static uint32_t mr_index; + IOMMUDevice *sdev; + + if (!sbus) { + sbus = g_malloc0(sizeof(IOMMUPciBus) + + sizeof(IOMMUDevice *) * PCI_DEVFN_MAX); + sbus->bus = bus; + g_hash_table_insert(s->as_by_busptr, bus, sbus); + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + char *name = g_strdup_printf("%s-%d-%d", + TYPE_VIRTIO_IOMMU_MEMORY_REGION, + mr_index++, devfn); + sdev = sbus->pbdev[devfn] = g_malloc0(sizeof(IOMMUDevice)); + + sdev->viommu = s; + sdev->bus = bus; + sdev->devfn = devfn; + + trace_virtio_iommu_init_iommu_mr(name); + + memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr), + TYPE_VIRTIO_IOMMU_MEMORY_REGION, + OBJECT(s), name, + UINT64_MAX); + address_space_init(&sdev->as, + MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU); + g_free(name); + } + return &sdev->as; +} + static int virtio_iommu_attach(VirtIOIOMMU *s, struct virtio_iommu_req_attach *req) { uint32_t domain_id = le32_to_cpu(req->domain); uint32_t ep_id = le32_to_cpu(req->endpoint); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; trace_virtio_iommu_attach(domain_id, ep_id); - return VIRTIO_IOMMU_S_UNSUPP; + ep = virtio_iommu_get_endpoint(s, ep_id); + if (!ep) { + return VIRTIO_IOMMU_S_NOENT; + } + + if (ep->domain) { + VirtIOIOMMUDomain *previous_domain = ep->domain; + /* + * the device is already attached to a domain, + * detach it first + */ + virtio_iommu_detach_endpoint_from_domain(ep); + if (QLIST_EMPTY(&previous_domain->endpoint_list)) { + g_tree_remove(s->domains, GUINT_TO_POINTER(previous_domain->id)); + } + } + + domain = virtio_iommu_get_domain(s, domain_id); + QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); + + ep->domain = domain; + + return VIRTIO_IOMMU_S_OK; } static int virtio_iommu_detach(VirtIOIOMMU *s, @@ -50,10 +269,28 @@ static int virtio_iommu_detach(VirtIOIOMMU *s, { uint32_t domain_id = le32_to_cpu(req->domain); uint32_t ep_id = le32_to_cpu(req->endpoint); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; trace_virtio_iommu_detach(domain_id, ep_id); - return VIRTIO_IOMMU_S_UNSUPP; + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id)); + if (!ep) { + return VIRTIO_IOMMU_S_NOENT; + } + + domain = ep->domain; + + if (!domain || domain->id != domain_id) { + return VIRTIO_IOMMU_S_INVAL; + } + + virtio_iommu_detach_endpoint_from_domain(ep); + + if (QLIST_EMPTY(&domain->endpoint_list)) { + g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id)); + } + return VIRTIO_IOMMU_S_OK; } static int virtio_iommu_map(VirtIOIOMMU *s, @@ -172,6 +409,27 @@ out: } } +static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, + IOMMUAccessFlags flag, + int iommu_idx) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + uint32_t sid; + + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = addr, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE, + }; + + sid = virtio_iommu_get_bdf(sdev); + + trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag); + return entry; +} + static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); @@ -218,6 +476,13 @@ static const VMStateDescription vmstate_virtio_iommu_device = { .unmigratable = 1, }; +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data) +{ + guint ua = GPOINTER_TO_UINT(a); + guint ub = GPOINTER_TO_UINT(b); + return (ua > ub) - (ua < ub); +} + static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); @@ -226,6 +491,8 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config)); + memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num)); + s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, virtio_iommu_handle_command); s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL); @@ -244,18 +511,43 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO); qemu_mutex_init(&s->mutex); + + s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free); + + if (s->primary_bus) { + pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s); + } else { + error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); + } } static void virtio_iommu_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + + g_tree_destroy(s->domains); + g_tree_destroy(s->endpoints); virtio_cleanup(vdev); } static void virtio_iommu_device_reset(VirtIODevice *vdev) { + VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + trace_virtio_iommu_device_reset(); + + if (s->domains) { + g_tree_destroy(s->domains); + } + if (s->endpoints) { + g_tree_destroy(s->endpoints); + } + s->domains = g_tree_new_full((GCompareDataFunc)int_cmp, + NULL, NULL, virtio_iommu_put_domain); + s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp, + NULL, NULL, virtio_iommu_put_endpoint); } static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status) @@ -301,6 +593,14 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) vdc->vmsd = &vmstate_virtio_iommu_device; } +static void virtio_iommu_memory_region_class_init(ObjectClass *klass, + void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = virtio_iommu_translate; +} + static const TypeInfo virtio_iommu_info = { .name = TYPE_VIRTIO_IOMMU, .parent = TYPE_VIRTIO_DEVICE, @@ -309,9 +609,16 @@ static const TypeInfo virtio_iommu_info = { .class_init = virtio_iommu_class_init, }; +static const TypeInfo virtio_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION, + .class_init = virtio_iommu_memory_region_class_init, +}; + static void virtio_register_types(void) { type_register_static(&virtio_iommu_info); + type_register_static(&virtio_iommu_memory_region_info); } type_init(virtio_register_types) diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index d24ba63305..ae88f730cf 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -28,6 +28,8 @@ #define VIRTIO_IOMMU(obj) \ OBJECT_CHECK(VirtIOIOMMU, (obj), TYPE_VIRTIO_IOMMU) +#define TYPE_VIRTIO_IOMMU_MEMORY_REGION "virtio-iommu-memory-region" + typedef struct IOMMUDevice { void *viommu; PCIBus *bus; @@ -48,6 +50,7 @@ typedef struct VirtIOIOMMU { struct virtio_iommu_config config; uint64_t features; GHashTable *as_by_busptr; + IOMMUPciBus *iommu_pcibus_by_bus_num[PCI_BUS_MAX]; PCIBus *primary_bus; GTree *domains; QemuMutex mutex; From fe2cacae2438930eaafe8dd31b3bea1947ee0087 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:39 +0100 Subject: [PATCH 15/30] virtio-iommu: Implement map/unmap This patch implements virtio_iommu_map/unmap. Signed-off-by: Eric Auger Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-5-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 63 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 15595f8cd7..22162d6583 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -64,6 +64,7 @@ virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d" virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s" virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index d9fe83f530..844d34c270 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu/iov.h" #include "qemu-common.h" #include "hw/qdev-properties.h" @@ -55,6 +56,11 @@ typedef struct VirtIOIOMMUInterval { uint64_t high; } VirtIOIOMMUInterval; +typedef struct VirtIOIOMMUMapping { + uint64_t phys_addr; + uint32_t flags; +} VirtIOIOMMUMapping; + static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) { return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); @@ -301,10 +307,39 @@ static int virtio_iommu_map(VirtIOIOMMU *s, uint64_t virt_start = le64_to_cpu(req->virt_start); uint64_t virt_end = le64_to_cpu(req->virt_end); uint32_t flags = le32_to_cpu(req->flags); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUInterval *interval; + VirtIOIOMMUMapping *mapping; + + if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) { + return VIRTIO_IOMMU_S_INVAL; + } + + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (!domain) { + return VIRTIO_IOMMU_S_NOENT; + } + + interval = g_malloc0(sizeof(*interval)); + + interval->low = virt_start; + interval->high = virt_end; + + mapping = g_tree_lookup(domain->mappings, (gpointer)interval); + if (mapping) { + g_free(interval); + return VIRTIO_IOMMU_S_INVAL; + } trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags); - return VIRTIO_IOMMU_S_UNSUPP; + mapping = g_malloc0(sizeof(*mapping)); + mapping->phys_addr = phys_start; + mapping->flags = flags; + + g_tree_insert(domain->mappings, interval, mapping); + + return VIRTIO_IOMMU_S_OK; } static int virtio_iommu_unmap(VirtIOIOMMU *s, @@ -313,10 +348,34 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, uint32_t domain_id = le32_to_cpu(req->domain); uint64_t virt_start = le64_to_cpu(req->virt_start); uint64_t virt_end = le64_to_cpu(req->virt_end); + VirtIOIOMMUMapping *iter_val; + VirtIOIOMMUInterval interval, *iter_key; + VirtIOIOMMUDomain *domain; + int ret = VIRTIO_IOMMU_S_OK; trace_virtio_iommu_unmap(domain_id, virt_start, virt_end); - return VIRTIO_IOMMU_S_UNSUPP; + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (!domain) { + return VIRTIO_IOMMU_S_NOENT; + } + interval.low = virt_start; + interval.high = virt_end; + + while (g_tree_lookup_extended(domain->mappings, &interval, + (void **)&iter_key, (void**)&iter_val)) { + uint64_t current_low = iter_key->low; + uint64_t current_high = iter_key->high; + + if (interval.low <= current_low && interval.high >= current_high) { + g_tree_remove(domain->mappings, iter_key); + trace_virtio_iommu_unmap_done(domain_id, current_low, current_high); + } else { + ret = VIRTIO_IOMMU_S_RANGE; + break; + } + } + return ret; } static int virtio_iommu_iov_to_req(struct iovec *iov, From ed8449b30b29f9da10746d3e5672248f3e4b675b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:40 +0100 Subject: [PATCH 16/30] virtio-iommu: Implement translate This patch implements the translate callback Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-6-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 60 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 22162d6583..095aa8b509 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -71,3 +71,4 @@ virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d" virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d" virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d" virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d" +virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 844d34c270..59e9cd3d9a 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -473,19 +473,77 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, int iommu_idx) { IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMUInterval interval, *mapping_key; + VirtIOIOMMUMapping *mapping_value; + VirtIOIOMMU *s = sdev->viommu; + VirtIOIOMMUEndpoint *ep; + bool bypass_allowed; uint32_t sid; + bool found; + + interval.low = addr; + interval.high = addr + 1; IOMMUTLBEntry entry = { .target_as = &address_space_memory, .iova = addr, .translated_addr = addr, - .addr_mask = ~(hwaddr)0, + .addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1, .perm = IOMMU_NONE, }; + bypass_allowed = virtio_vdev_has_feature(&s->parent_obj, + VIRTIO_IOMMU_F_BYPASS); + sid = virtio_iommu_get_bdf(sdev); trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag); + qemu_mutex_lock(&s->mutex); + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + if (!ep) { + if (!bypass_allowed) { + error_report_once("%s sid=%d is not known!!", __func__, sid); + } else { + entry.perm = flag; + } + goto unlock; + } + + if (!ep->domain) { + if (!bypass_allowed) { + error_report_once("%s %02x:%02x.%01x not attached to any domain", + __func__, PCI_BUS_NUM(sid), + PCI_SLOT(sid), PCI_FUNC(sid)); + } else { + entry.perm = flag; + } + goto unlock; + } + + found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval), + (void **)&mapping_key, + (void **)&mapping_value); + if (!found) { + error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d", + __func__, addr, sid); + goto unlock; + } + + if (((flag & IOMMU_RO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ)) || + ((flag & IOMMU_WO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE))) { + error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d", + __func__, addr, flag, mapping_value->flags); + goto unlock; + } + entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr; + entry.perm = flag; + trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid); + +unlock: + qemu_mutex_unlock(&s->mutex); return entry; } From a7c1da8adc4553854433d749c69319daa670ce69 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:41 +0100 Subject: [PATCH 17/30] virtio-iommu: Implement fault reporting The event queue allows to report asynchronous errors. The translate function now injects faults when relevant. Signed-off-by: Eric Auger Reviewed-by: Peter Xu Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-7-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 70 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 095aa8b509..e83500bee9 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -72,3 +72,4 @@ virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d" virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d" virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d" virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d" +virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64 diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 59e9cd3d9a..8509f64004 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -468,6 +468,48 @@ out: } } +static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason, + int flags, uint32_t endpoint, + uint64_t address) +{ + VirtIODevice *vdev = &viommu->parent_obj; + VirtQueue *vq = viommu->event_vq; + struct virtio_iommu_fault fault; + VirtQueueElement *elem; + size_t sz; + + memset(&fault, 0, sizeof(fault)); + fault.reason = reason; + fault.flags = cpu_to_le32(flags); + fault.endpoint = cpu_to_le32(endpoint); + fault.address = cpu_to_le64(address); + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + + if (!elem) { + error_report_once( + "no buffer available in event queue to report event"); + return; + } + + if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) { + virtio_error(vdev, "error buffer of wrong size"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + return; + } + + sz = iov_from_buf(elem->in_sg, elem->in_num, 0, + &fault, sizeof(fault)); + assert(sz == sizeof(fault)); + + trace_virtio_iommu_report_fault(reason, flags, endpoint, address); + virtqueue_push(vq, elem, sz); + virtio_notify(vdev, vq); + g_free(elem); + +} + static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, IOMMUAccessFlags flag, int iommu_idx) @@ -476,9 +518,10 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, VirtIOIOMMUInterval interval, *mapping_key; VirtIOIOMMUMapping *mapping_value; VirtIOIOMMU *s = sdev->viommu; + bool read_fault, write_fault; VirtIOIOMMUEndpoint *ep; + uint32_t sid, flags; bool bypass_allowed; - uint32_t sid; bool found; interval.low = addr; @@ -504,6 +547,9 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, if (!ep) { if (!bypass_allowed) { error_report_once("%s sid=%d is not known!!", __func__, sid); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); } else { entry.perm = flag; } @@ -515,6 +561,9 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, error_report_once("%s %02x:%02x.%01x not attached to any domain", __func__, PCI_BUS_NUM(sid), PCI_SLOT(sid), PCI_FUNC(sid)); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); } else { entry.perm = flag; } @@ -527,15 +576,26 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, if (!found) { error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d", __func__, addr, sid); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); goto unlock; } - if (((flag & IOMMU_RO) && - !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ)) || - ((flag & IOMMU_WO) && - !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE))) { + read_fault = (flag & IOMMU_RO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ); + write_fault = (flag & IOMMU_WO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE); + + flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0; + flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0; + if (flags) { error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d", __func__, addr, flag, mapping_value->flags); + flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS; + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, + flags | VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); goto unlock; } entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr; From bd0ab8702fa272d39644bbefa1d3f4b42fc594ff Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:42 +0100 Subject: [PATCH 18/30] virtio-iommu: Support migration Add Migration support. We rely on recently added gtree and qlist migration. We only migrate the domain gtree. The endpoint gtree is re-constructed in a post-load operation. Signed-off-by: Eric Auger Acked-by: Peter Xu Reviewed-by: Juan Quintela Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-8-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 109 +++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 10 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 8509f64004..4cee8083bc 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -643,16 +643,6 @@ static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f, return f; } -/* - * Migration is not yet supported: most of the state consists - * of balanced binary trees which are not yet ready for getting - * migrated - */ -static const VMStateDescription vmstate_virtio_iommu_device = { - .name = "virtio-iommu-device", - .unmigratable = 1, -}; - static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data) { guint ua = GPOINTER_TO_UINT(a); @@ -736,9 +726,108 @@ static void virtio_iommu_instance_init(Object *obj) { } +#define VMSTATE_INTERVAL \ +{ \ + .name = "interval", \ + .version_id = 1, \ + .minimum_version_id = 1, \ + .fields = (VMStateField[]) { \ + VMSTATE_UINT64(low, VirtIOIOMMUInterval), \ + VMSTATE_UINT64(high, VirtIOIOMMUInterval), \ + VMSTATE_END_OF_LIST() \ + } \ +} + +#define VMSTATE_MAPPING \ +{ \ + .name = "mapping", \ + .version_id = 1, \ + .minimum_version_id = 1, \ + .fields = (VMStateField[]) { \ + VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\ + VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \ + VMSTATE_END_OF_LIST() \ + }, \ +} + +static const VMStateDescription vmstate_interval_mapping[2] = { + VMSTATE_MAPPING, /* value */ + VMSTATE_INTERVAL /* key */ +}; + +static int domain_preload(void *opaque) +{ + VirtIOIOMMUDomain *domain = opaque; + + domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp, + NULL, g_free, g_free); + return 0; +} + +static const VMStateDescription vmstate_endpoint = { + .name = "endpoint", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, VirtIOIOMMUEndpoint), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_domain = { + .name = "domain", + .version_id = 1, + .minimum_version_id = 1, + .pre_load = domain_preload, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, VirtIOIOMMUDomain), + VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1, + vmstate_interval_mapping, + VirtIOIOMMUInterval, VirtIOIOMMUMapping), + VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1, + vmstate_endpoint, VirtIOIOMMUEndpoint, next), + VMSTATE_END_OF_LIST() + } +}; + +static gboolean reconstruct_endpoints(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMU *s = (VirtIOIOMMU *)data; + VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value; + VirtIOIOMMUEndpoint *iter; + + QLIST_FOREACH(iter, &d->endpoint_list, next) { + iter->domain = d; + g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter); + } + return false; /* continue the domain traversal */ +} + +static int iommu_post_load(void *opaque, int version_id) +{ + VirtIOIOMMU *s = opaque; + + g_tree_foreach(s->domains, reconstruct_endpoints, s); + return 0; +} + +static const VMStateDescription vmstate_virtio_iommu_device = { + .name = "virtio-iommu-device", + .minimum_version_id = 1, + .version_id = 1, + .post_load = iommu_post_load, + .fields = (VMStateField[]) { + VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1, + &vmstate_domain, VirtIOIOMMUDomain), + VMSTATE_END_OF_LIST() + }, +}; + static const VMStateDescription vmstate_virtio_iommu = { .name = "virtio-iommu", .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, .version_id = 1, .fields = (VMStateField[]) { VMSTATE_VIRTIO_DEVICE, From 8b4eb09e32456ebd769fe09a1aa338d6a3536376 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:43 +0100 Subject: [PATCH 19/30] virtio-iommu-pci: Add virtio iommu pci support This patch adds virtio-iommu-pci, which is the pci proxy for the virtio-iommu device. Currently non DT integration is not yet supported by the kernel. So the machine must implement a hotplug handler for the virtio-iommu-pci device that creates the device tree iommu-map bindings as documented in kernel documentation: Documentation/devicetree/bindings/virtio/iommu.txt Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Reviewed-by: Michael S. Tsirkin Message-Id: <20200214132745.23392-9-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 1 + hw/virtio/virtio-iommu-pci.c | 104 +++++++++++++++++++++++++++++++ include/hw/pci/pci.h | 1 + include/hw/virtio/virtio-iommu.h | 1 + qdev-monitor.c | 1 + 5 files changed, 108 insertions(+) create mode 100644 hw/virtio/virtio-iommu-pci.c diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index 2fd9da7410..4e4d39a0a4 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -29,6 +29,7 @@ obj-$(CONFIG_VIRTIO_INPUT_HOST) += virtio-input-host-pci.o obj-$(CONFIG_VIRTIO_INPUT) += virtio-input-pci.o obj-$(CONFIG_VIRTIO_RNG) += virtio-rng-pci.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon-pci.o +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu-pci.o obj-$(CONFIG_VIRTIO_9P) += virtio-9p-pci.o obj-$(CONFIG_VIRTIO_SCSI) += virtio-scsi-pci.o obj-$(CONFIG_VIRTIO_BLK) += virtio-blk-pci.o diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c new file mode 100644 index 0000000000..3dfbf55b47 --- /dev/null +++ b/hw/virtio/virtio-iommu-pci.c @@ -0,0 +1,104 @@ +/* + * Virtio IOMMU PCI Bindings + * + * Copyright (c) 2019 Red Hat, Inc. + * Written by Eric Auger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 or + * (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "virtio-pci.h" +#include "hw/virtio/virtio-iommu.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "hw/boards.h" + +typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI; + +/* + * virtio-iommu-pci: This extends VirtioPCIProxy. + * + */ +#define VIRTIO_IOMMU_PCI(obj) \ + OBJECT_CHECK(VirtIOIOMMUPCI, (obj), TYPE_VIRTIO_IOMMU_PCI) + +struct VirtIOIOMMUPCI { + VirtIOPCIProxy parent_obj; + VirtIOIOMMU vdev; +}; + +static Property virtio_iommu_pci_properties[] = { + DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) { + MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + + error_setg(errp, + "%s machine fails to create iommu-map device tree bindings", + mc->name); + error_append_hint(errp, + "Check you machine implements a hotplug handler " + "for the virtio-iommu-pci device\n"); + error_append_hint(errp, "Check the guest is booted without FW or with " + "-no-acpi\n"); + return; + } + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); + object_property_set_link(OBJECT(dev), + OBJECT(pci_get_bus(&vpci_dev->pci_dev)), + "primary-bus", errp); + object_property_set_bool(OBJECT(vdev), true, "realized", errp); +} + +static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = virtio_iommu_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, virtio_iommu_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_IOMMU; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; + dc->hotpluggable = false; +} + +static void virtio_iommu_pci_instance_init(Object *obj) +{ + VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_IOMMU); +} + +static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = { + .base_name = TYPE_VIRTIO_IOMMU_PCI, + .generic_name = "virtio-iommu-pci", + .transitional_name = "virtio-iommu-pci-transitional", + .non_transitional_name = "virtio-iommu-pci-non-transitional", + .instance_size = sizeof(VirtIOIOMMUPCI), + .instance_init = virtio_iommu_pci_instance_init, + .class_init = virtio_iommu_pci_class_init, +}; + +static void virtio_iommu_pci_register(void) +{ + virtio_pci_types_register(&virtio_iommu_pci_info); +} + +type_init(virtio_iommu_pci_register) + + diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 2acd8321af..cfedf5a995 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -86,6 +86,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_VIRTIO_9P 0x1009 #define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012 #define PCI_DEVICE_ID_VIRTIO_PMEM 0x1013 +#define PCI_DEVICE_ID_VIRTIO_IOMMU 0x1014 #define PCI_VENDOR_ID_REDHAT 0x1b36 #define PCI_DEVICE_ID_REDHAT_BRIDGE 0x0001 diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index ae88f730cf..6f67f1020a 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -25,6 +25,7 @@ #include "hw/pci/pci.h" #define TYPE_VIRTIO_IOMMU "virtio-iommu-device" +#define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-device-base" #define VIRTIO_IOMMU(obj) \ OBJECT_CHECK(VirtIOIOMMU, (obj), TYPE_VIRTIO_IOMMU) diff --git a/qdev-monitor.c b/qdev-monitor.c index 8ce71a206b..dbbe92dfa1 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -67,6 +67,7 @@ static const QDevAlias qdev_alias_table[] = { { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_S390X }, { "virtio-input-host-pci", "virtio-input-host", QEMU_ARCH_ALL & ~QEMU_ARCH_S390X }, + { "virtio-iommu-pci", "virtio-iommu", QEMU_ARCH_ALL & ~QEMU_ARCH_S390X }, { "virtio-keyboard-ccw", "virtio-keyboard", QEMU_ARCH_S390X }, { "virtio-keyboard-pci", "virtio-keyboard", QEMU_ARCH_ALL & ~QEMU_ARCH_S390X }, From 70e89132c9e67473b7cbb507dd13c96aab598517 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:44 +0100 Subject: [PATCH 20/30] hw/arm/virt: Add the virtio-iommu device tree mappings Adds the "virtio,pci-iommu" node in the host bridge node and the RID mapping, excluding the IOMMU RID. This is done in the virtio-iommu-pci hotplug handler which gets called only if no firmware is loaded or if -no-acpi is passed on the command line. As non DT integration is not yet supported by the kernel we must make sure we are in DT mode. This limitation will be removed as soon as the topology description feature gets supported. Signed-off-by: Eric Auger Message-Id: <20200214132745.23392-10-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Peter Maydell --- hw/arm/virt.c | 57 +++++++++++++++++++++++++++++++++++++------ include/hw/arm/virt.h | 2 ++ 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index f788fe27d6..cfe317922f 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -32,6 +32,7 @@ #include "qemu-common.h" #include "qemu/units.h" #include "qemu/option.h" +#include "monitor/qdev.h" #include "qapi/error.h" #include "hw/sysbus.h" #include "hw/boards.h" @@ -54,6 +55,7 @@ #include "qemu/error-report.h" #include "qemu/module.h" #include "hw/pci-host/gpex.h" +#include "hw/virtio/virtio-pci.h" #include "hw/arm/sysbus-fdt.h" #include "hw/platform-bus.h" #include "hw/qdev-properties.h" @@ -71,6 +73,7 @@ #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" #include "hw/acpi/generic_event_device.h" +#include "hw/virtio/virtio-iommu.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -1180,6 +1183,30 @@ static void create_smmu(const VirtMachineState *vms, g_free(node); } +static void create_virtio_iommu_dt_bindings(VirtMachineState *vms, Error **errp) +{ + const char compat[] = "virtio,pci-iommu"; + uint16_t bdf = vms->virtio_iommu_bdf; + char *node; + + vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt); + + node = g_strdup_printf("%s/virtio_iommu@%d", vms->pciehb_nodename, bdf); + qemu_fdt_add_subnode(vms->fdt, node); + qemu_fdt_setprop(vms->fdt, node, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(vms->fdt, node, "reg", + 1, bdf << 8, 1, 0, 1, 0, + 1, 0, 1, 0); + + qemu_fdt_setprop_cell(vms->fdt, node, "#iommu-cells", 1); + qemu_fdt_setprop_cell(vms->fdt, node, "phandle", vms->iommu_phandle); + g_free(node); + + qemu_fdt_setprop_cells(vms->fdt, vms->pciehb_nodename, "iommu-map", + 0x0, vms->iommu_phandle, 0x0, bdf, + bdf + 1, vms->iommu_phandle, bdf + 1, 0xffff - bdf); +} + static void create_pcie(VirtMachineState *vms) { hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base; @@ -1258,7 +1285,7 @@ static void create_pcie(VirtMachineState *vms) } } - nodename = g_strdup_printf("/pcie@%" PRIx64, base); + nodename = vms->pciehb_nodename = g_strdup_printf("/pcie@%" PRIx64, base); qemu_fdt_add_subnode(vms->fdt, nodename); qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "pci-host-ecam-generic"); @@ -1301,13 +1328,16 @@ static void create_pcie(VirtMachineState *vms) if (vms->iommu) { vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt); - create_smmu(vms, pci->bus); - - qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map", - 0x0, vms->iommu_phandle, 0x0, 0x10000); + switch (vms->iommu) { + case VIRT_IOMMU_SMMUV3: + create_smmu(vms, pci->bus); + qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map", + 0x0, vms->iommu_phandle, 0x0, 0x10000); + break; + default: + g_assert_not_reached(); + } } - - g_free(nodename); } static void create_platform_bus(VirtMachineState *vms) @@ -1976,6 +2006,13 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { virt_memory_plug(hotplug_dev, dev, errp); } + if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + PCIDevice *pdev = PCI_DEVICE(dev); + + vms->iommu = VIRT_IOMMU_VIRTIO; + vms->virtio_iommu_bdf = pci_get_bdf(pdev); + create_virtio_iommu_dt_bindings(vms, errp); + } } static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, @@ -1992,7 +2029,13 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM))) { return HOTPLUG_HANDLER(machine); } + if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + VirtMachineState *vms = VIRT_MACHINE(machine); + if (!vms->bootinfo.firmware_loaded || !acpi_enabled) { + return HOTPLUG_HANDLER(machine); + } + } return NULL; } diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 71508bf40c..02f500cb8e 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -125,8 +125,10 @@ typedef struct { bool virt; int32_t gic_version; VirtIOMMUType iommu; + uint16_t virtio_iommu_bdf; struct arm_boot_info bootinfo; MemMapEntry *memmap; + char *pciehb_nodename; const int *irqmap; int smp_cpus; void *fdt; From c1dee918799391980dbde12ee60fbd0189f367f8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Feb 2020 14:27:45 +0100 Subject: [PATCH 21/30] MAINTAINERS: add virtio-iommu related files Add a new "virtio-iommu" section with the new files related to this device. Signed-off-by: Eric Auger Message-Id: <20200214132745.23392-11-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Peter Maydell --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 195dd58cac..ff06934bb5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1631,6 +1631,12 @@ F: hw/input/virtio-input*.c F: include/hw/virtio/virtio-input.h F: contrib/vhost-user-input/* +virtio-iommu +M: Eric Auger +S: Maintained +F: hw/virtio/virtio-iommu*.c +F: include/hw/virtio/virtio-iommu.h + virtio-serial M: Laurent Vivier R: Amit Shah From 8899d60142e790debe1604a89d0028ba1740d8e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:03 +0100 Subject: [PATCH 22/30] libvhost-user: implement VHOST_USER_PROTOCOL_F_REPLY_ACK This is really simple, since we know whether a response is already requested or not, so we can just send a (successful) response when there isn't one already. Given that, it's not all _that_ useful but the master can at least be sure the message was processed, and we can exercise more code paths using the example code. Reviewed-by: Stefan Hajnoczi Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-2-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index b89bf18501..533d55d82a 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -1199,7 +1199,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | - 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD; + 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | + 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; if (have_userfault()) { features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; @@ -1581,13 +1582,20 @@ vu_dispatch(VuDev *dev) { VhostUserMsg vmsg = { 0, }; int reply_requested; - bool success = false; + bool need_reply, success = false; if (!vu_message_read(dev, dev->sock, &vmsg)) { goto end; } + need_reply = vmsg.flags & VHOST_USER_NEED_REPLY_MASK; + reply_requested = vu_process_message(dev, &vmsg); + if (!reply_requested && need_reply) { + vmsg_set_reply_u64(&vmsg, 0); + reply_requested = 1; + } + if (!reply_requested) { success = true; goto end; From a7290a79fa262124916dab2bb75188cfd07faad6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:04 +0100 Subject: [PATCH 23/30] libvhost-user-glib: fix VugDev main fd cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If you try to make a device implementation that can handle multiple connections and allow disconnections (which requires overriding the VHOST_USER_NONE handling), then glib will warn that we remove a src while it's still on the mainloop, and will poll() an FD that doesn't exist anymore. Fix this by making vug_source_new() require pairing with the new vug_source_destroy() so we can keep the GSource referenced in the meantime. Note that this requires calling the new API in vhost-user-input. vhost-user-gpu also uses vug_source_new(), but never seems to free the result at all, so I haven't changed anything there. Fixes: 8bb7ddb78a1c ("libvhost-user: add glib source helper") Reviewed-by: Marc-André Lureau Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-3-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user-glib.c | 15 ++++++++++++--- contrib/libvhost-user/libvhost-user-glib.h | 1 + contrib/vhost-user-input/main.c | 6 ++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user-glib.c b/contrib/libvhost-user/libvhost-user-glib.c index 99edd2f3de..824c7780de 100644 --- a/contrib/libvhost-user/libvhost-user-glib.c +++ b/contrib/libvhost-user/libvhost-user-glib.c @@ -91,7 +91,6 @@ vug_source_new(VugDev *gdev, int fd, GIOCondition cond, g_source_add_poll(gsrc, &src->gfd); id = g_source_attach(gsrc, NULL); g_assert(id); - g_source_unref(gsrc); return gsrc; } @@ -131,6 +130,16 @@ static void vug_watch(VuDev *dev, int condition, void *data) } } +void vug_source_destroy(GSource *src) +{ + if (!src) { + return; + } + + g_source_destroy(src); + g_source_unref(src); +} + bool vug_init(VugDev *dev, uint16_t max_queues, int socket, vu_panic_cb panic, const VuDevIface *iface) @@ -144,7 +153,7 @@ vug_init(VugDev *dev, uint16_t max_queues, int socket, } dev->fdmap = g_hash_table_new_full(NULL, NULL, NULL, - (GDestroyNotify) g_source_destroy); + (GDestroyNotify) vug_source_destroy); dev->src = vug_source_new(dev, socket, G_IO_IN, vug_watch, NULL); @@ -157,5 +166,5 @@ vug_deinit(VugDev *dev) g_assert(dev); g_hash_table_unref(dev->fdmap); - g_source_unref(dev->src); + vug_source_destroy(dev->src); } diff --git a/contrib/libvhost-user/libvhost-user-glib.h b/contrib/libvhost-user/libvhost-user-glib.h index 64d539d93a..1a79a4916e 100644 --- a/contrib/libvhost-user/libvhost-user-glib.h +++ b/contrib/libvhost-user/libvhost-user-glib.h @@ -31,5 +31,6 @@ void vug_deinit(VugDev *dev); GSource *vug_source_new(VugDev *dev, int fd, GIOCondition cond, vu_watch_cb vu_cb, gpointer data); +void vug_source_destroy(GSource *src); #endif /* LIBVHOST_USER_GLIB_H */ diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c index ef4b7769f2..6020c6f33a 100644 --- a/contrib/vhost-user-input/main.c +++ b/contrib/vhost-user-input/main.c @@ -187,7 +187,7 @@ vi_queue_set_started(VuDev *dev, int qidx, bool started) } if (!started && vi->evsrc) { - g_source_destroy(vi->evsrc); + vug_source_destroy(vi->evsrc); vi->evsrc = NULL; } } @@ -401,9 +401,7 @@ main(int argc, char *argv[]) vug_deinit(&vi.dev); - if (vi.evsrc) { - g_source_unref(vi.evsrc); - } + vug_source_destroy(vi.evsrc); g_array_free(vi.config, TRUE); g_free(vi.queue); return 0; From a00fdc9c9d3e7f2c3c21d14ba8fd04451ec423a2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:05 +0100 Subject: [PATCH 24/30] libvhost-user-glib: use g_main_context_get_thread_default() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we use NULL, we just get the main program default mainloop here. Using g_main_context_get_thread_default() has basically the same effect, but it lets us start different devices in different threads with different mainloops, which can be useful. Reviewed-by: Stefan Hajnoczi Reviewed-by: Marc-André Lureau Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-4-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user-glib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/libvhost-user/libvhost-user-glib.c b/contrib/libvhost-user/libvhost-user-glib.c index 824c7780de..53f1ca4cdd 100644 --- a/contrib/libvhost-user/libvhost-user-glib.c +++ b/contrib/libvhost-user/libvhost-user-glib.c @@ -89,7 +89,7 @@ vug_source_new(VugDev *gdev, int fd, GIOCondition cond, src->gfd.events = cond; g_source_add_poll(gsrc, &src->gfd); - id = g_source_attach(gsrc, NULL); + id = g_source_attach(gsrc, g_main_context_get_thread_default()); g_assert(id); return gsrc; From d5f99fc578d5f4f2ab731ec692f34ce875d870fe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:06 +0100 Subject: [PATCH 25/30] libvhost-user: handle NOFD flag in call/kick/err better The code here is odd, for example will it print out invalid file descriptor numbers that were never sent in the message. Clean that up a bit so it's actually possible to implement a device that uses polling. Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-5-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 533d55d82a..3abc9689e5 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -948,6 +948,7 @@ static bool vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) { int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; if (index >= dev->max_queues) { vmsg_close_fds(vmsg); @@ -955,8 +956,12 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) return false; } - if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK || - vmsg->fd_num != 1) { + if (nofd) { + vmsg_close_fds(vmsg); + return true; + } + + if (vmsg->fd_num != 1) { vmsg_close_fds(vmsg); vu_panic(dev, "Invalid fds in request: %d", vmsg->request); return false; @@ -1053,6 +1058,7 @@ static bool vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) { int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); @@ -1066,8 +1072,8 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) dev->vq[index].kick_fd = -1; } - dev->vq[index].kick_fd = vmsg->fds[0]; - DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); + dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; + DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); dev->vq[index].started = true; if (dev->iface->queue_set_started) { @@ -1147,6 +1153,7 @@ static bool vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) { int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); @@ -1159,14 +1166,14 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) dev->vq[index].call_fd = -1; } - dev->vq[index].call_fd = vmsg->fds[0]; + dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0]; /* in case of I/O hang after reconnecting */ - if (eventfd_write(vmsg->fds[0], 1)) { + if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) { return -1; } - DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); + DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index); return false; } @@ -1175,6 +1182,7 @@ static bool vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) { int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); @@ -1187,7 +1195,7 @@ vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) dev->vq[index].err_fd = -1; } - dev->vq[index].err_fd = vmsg->fds[0]; + dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0]; return false; } From 3348e7e34ff945340dbcf310d29d4feb5de20ef6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:07 +0100 Subject: [PATCH 26/30] docs: vhost-user: add in-band kick/call messages For good reason, vhost-user is currently built asynchronously, that way better performance can be obtained. However, for certain use cases such as simulation, this is problematic. Consider an event-based simulation in which both the device and CPU have scheduled according to a simulation "calendar". Now, consider the CPU sending I/O to the device, over a vring in the vhost-user protocol. In this case, the CPU must wait for the vring interrupt to have been processed by the device, so that the device is able to put an entry onto the simulation calendar to obtain time to handle the interrupt. Note that this doesn't mean the I/O is actually done at this time, it just means that the handling of it is scheduled before the CPU can continue running. This cannot be done with the asynchronous eventfd based vring kick and call design. Extend the protocol slightly, so that a message can be used for kick and call instead, if VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS is negotiated. This in itself doesn't guarantee synchronisation, but both sides can also negotiate VHOST_USER_PROTOCOL_F_REPLY_ACK and thus get a reply to this message by setting the need_reply flag, and ensure synchronisation this way. To really use it in both directions, VHOST_USER_PROTOCOL_F_SLAVE_REQ is also needed. Since it is used for simulation purposes and too many messages on the socket can lock up the virtual machine, document that this should only be used together with the mentioned features. Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-6-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/interop/vhost-user.rst | 122 ++++++++++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 18 deletions(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 5f8b3a456b..401652397c 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -2,6 +2,7 @@ Vhost-user Protocol =================== :Copyright: 2014 Virtual Open Systems Sarl. +:Copyright: 2019 Intel Corporation :Licence: This work is licensed under the terms of the GNU GPL, version 2 or later. See the COPYING file in the top-level directory. @@ -279,6 +280,9 @@ If *master* is unable to send the full message or receives a wrong reply it will close the connection. An optional reconnection mechanism can be implemented. +If *slave* detects some error such as incompatible features, it may also +close the connection. This should only happen in exceptional circumstances. + Any protocol extensions are gated by protocol feature bits, which allows full backwards compatibility on both master and slave. As older slaves don't support negotiating protocol features, a feature @@ -315,7 +319,8 @@ it until ring is started, or after it has been stopped. Client must start ring upon receiving a kick (that is, detecting that file descriptor is readable) on the descriptor specified by -``VHOST_USER_SET_VRING_KICK``, and stop ring upon receiving +``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message +``VHOST_USER_VRING_KICK`` if negotiated, and stop ring upon receiving ``VHOST_USER_GET_VRING_BASE``. While processing the rings (whether they are enabled or not), client @@ -767,25 +772,49 @@ When reconnecting: #. Resubmit inflight ``DescStatePacked`` entries in order of their counter value +In-band notifications +--------------------- + +In some limited situations (e.g. for simulation) it is desirable to +have the kick, call and error (if used) signals done via in-band +messages instead of asynchronous eventfd notifications. This can be +done by negotiating the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` +protocol feature. + +Note that due to the fact that too many messages on the sockets can +cause the sending application(s) to block, it is not advised to use +this feature unless absolutely necessary. It is also considered an +error to negotiate this feature without also negotiating +``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` and ``VHOST_USER_PROTOCOL_F_REPLY_ACK``, +the former is necessary for getting a message channel from the slave +to the master, while the latter needs to be used with the in-band +notification messages to block until they are processed, both to avoid +blocking later and for proper processing (at least in the simulation +use case.) As it has no other way of signalling this error, the slave +should close the connection as a response to a +``VHOST_USER_SET_PROTOCOL_FEATURES`` message that sets the in-band +notifications feature flag without the other two. + Protocol features ----------------- .. code:: c - #define VHOST_USER_PROTOCOL_F_MQ 0 - #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 - #define VHOST_USER_PROTOCOL_F_RARP 2 - #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 - #define VHOST_USER_PROTOCOL_F_MTU 4 - #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 - #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 - #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 - #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 - #define VHOST_USER_PROTOCOL_F_CONFIG 9 - #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 - #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 - #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 - #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 + #define VHOST_USER_PROTOCOL_F_MQ 0 + #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 + #define VHOST_USER_PROTOCOL_F_RARP 2 + #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 + #define VHOST_USER_PROTOCOL_F_MTU 4 + #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 + #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 + #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 + #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 + #define VHOST_USER_PROTOCOL_F_CONFIG 9 + #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 + #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 + #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 + #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 + #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 Master message types -------------------- @@ -947,7 +976,12 @@ Master message types Bits (0-7) of the payload contain the vring index. Bit 8 is the invalid FD flag. This flag is set when there is no file descriptor in the ancillary data. This signals that polling should be used - instead of waiting for a kick. + instead of waiting for the kick. Note that if the protocol feature + ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` has been negotiated + this message isn't necessary as the ring is also started on the + ``VHOST_USER_VRING_KICK`` message, it may however still be used to + set an event file descriptor (which will be preferred over the + message) or to enable polling. ``VHOST_USER_SET_VRING_CALL`` :id: 13 @@ -960,7 +994,12 @@ Master message types Bits (0-7) of the payload contain the vring index. Bit 8 is the invalid FD flag. This flag is set when there is no file descriptor in the ancillary data. This signals that polling will be used - instead of waiting for the call. + instead of waiting for the call. Note that if the protocol features + ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` and + ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` have been negotiated this message + isn't necessary as the ``VHOST_USER_SLAVE_VRING_CALL`` message can be + used, it may however still be used to set an event file descriptor + or to enable polling. ``VHOST_USER_SET_VRING_ERR`` :id: 14 @@ -972,7 +1011,12 @@ Master message types Bits (0-7) of the payload contain the vring index. Bit 8 is the invalid FD flag. This flag is set when there is no file descriptor - in the ancillary data. + in the ancillary data. Note that if the protocol features + ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` and + ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` have been negotiated this message + isn't necessary as the ``VHOST_USER_SLAVE_VRING_ERR`` message can be + used, it may however still be used to set an event file descriptor + (which will be preferred over the message). ``VHOST_USER_GET_QUEUE_NUM`` :id: 17 @@ -1205,6 +1249,20 @@ Master message types Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol feature is set by the backend. +``VHOST_USER_VRING_KICK`` + :id: 35 + :equivalent ioctl: N/A + :slave payload: vring state description + :master payload: N/A + + When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol + feature has been successfully negotiated, this message may be + submitted by the master to indicate that a buffer was added to + the vring instead of signalling it using the vring's kick file + descriptor or having the slave rely on polling. + + The state.num field is currently reserved and must be set to 0. + Slave message types ------------------- @@ -1261,6 +1319,34 @@ Slave message types ``VHOST_USER_PROTOCOL_F_HOST_NOTIFIER`` protocol feature has been successfully negotiated. +``VHOST_USER_SLAVE_VRING_CALL`` + :id: 4 + :equivalent ioctl: N/A + :slave payload: vring state description + :master payload: N/A + + When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol + feature has been successfully negotiated, this message may be + submitted by the slave to indicate that a buffer was used from + the vring instead of signalling this using the vring's call file + descriptor or having the master relying on polling. + + The state.num field is currently reserved and must be set to 0. + +``VHOST_USER_SLAVE_VRING_ERR`` + :id: 5 + :equivalent ioctl: N/A + :slave payload: vring state description + :master payload: N/A + + When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol + feature has been successfully negotiated, this message may be + submitted by the slave to indicate that an error occurred on the + specific vring, instead of signalling the error file descriptor + set by the master via ``VHOST_USER_SET_VRING_ERR``. + + The state.num field is currently reserved and must be set to 0. + .. _reply_ack: VHOST_USER_PROTOCOL_F_REPLY_ACK From ff1320050a3a7af8b094020c5e4bb6ad736421a8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2020 09:17:08 +0100 Subject: [PATCH 27/30] libvhost-user: implement in-band notifications Add support for VHOST_USER_PROTOCOL_F_IN_BAND_NOTIFICATIONS, but as it's not desired by default, don't enable it unless the device implementation opts in by returning it from its protocol features callback. Note that I updated vu_set_vring_err_exec(), but didn't add any sending of the VHOST_USER_SLAVE_VRING_ERR message as there's no write to the err_fd today either. This also adds vu_queue_notify_sync() which can be used to force a synchronous notification if inband notifications are supported. Previously, I had left out the slave->master direction handling of F_REPLY_ACK, this now adds some code to support it as well. Signed-off-by: Johannes Berg Message-Id: <20200123081708.7817-7-johannes@sipsolutions.net> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 103 +++++++++++++++++++++++++- contrib/libvhost-user/libvhost-user.h | 14 ++++ 2 files changed, 114 insertions(+), 3 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 3abc9689e5..3bca996c62 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -136,6 +136,7 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_GET_INFLIGHT_FD), REQ(VHOST_USER_SET_INFLIGHT_FD), REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), REQ(VHOST_USER_MAX), }; #undef REQ @@ -163,7 +164,10 @@ vu_panic(VuDev *dev, const char *msg, ...) dev->panic(dev, buf); free(buf); - /* FIXME: find a way to call virtio_error? */ + /* + * FIXME: + * find a way to call virtio_error, or perhaps close the connection? + */ } /* Translate guest physical address to our virtual address. */ @@ -1203,6 +1207,14 @@ vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) static bool vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) { + /* + * Note that we support, but intentionally do not set, + * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that + * a device implementation can return it in its callback + * (get_protocol_features) if it wants to use this for + * simulation, but it is otherwise not desirable (if even + * implemented by the master.) + */ uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ | 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | @@ -1235,6 +1247,25 @@ vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) dev->protocol_features = vmsg->payload.u64; + if (vu_has_protocol_feature(dev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) || + !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. + */ + vu_panic(dev, + "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK"); + return false; + } + if (dev->iface->set_protocol_features) { dev->iface->set_protocol_features(dev, features); } @@ -1495,6 +1526,34 @@ vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) return false; } +static bool +vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + + if (index >= dev->max_queues) { + vu_panic(dev, "Invalid queue index: %u", index); + return false; + } + + DPRINT("Got kick message: handler:%p idx:%d\n", + dev->vq[index].handler, index); + + if (!dev->vq[index].started) { + dev->vq[index].started = true; + + if (dev->iface->queue_set_started) { + dev->iface->queue_set_started(dev, index, true); + } + } + + if (dev->vq[index].handler) { + dev->vq[index].handler(dev, index); + } + + return false; +} + static bool vu_process_message(VuDev *dev, VhostUserMsg *vmsg) { @@ -1577,6 +1636,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_get_inflight_fd(dev, vmsg); case VHOST_USER_SET_INFLIGHT_FD: return vu_set_inflight_fd(dev, vmsg); + case VHOST_USER_VRING_KICK: + return vu_handle_vring_kick(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); @@ -2038,8 +2099,7 @@ vring_notify(VuDev *dev, VuVirtq *vq) return !v || vring_need_event(vring_get_used_event(vq), new, old); } -void -vu_queue_notify(VuDev *dev, VuVirtq *vq) +static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) { if (unlikely(dev->broken) || unlikely(!vq->vring.avail)) { @@ -2051,11 +2111,48 @@ vu_queue_notify(VuDev *dev, VuVirtq *vq) return; } + if (vq->call_fd < 0 && + vu_has_protocol_feature(dev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + VhostUserMsg vmsg = { + .request = VHOST_USER_SLAVE_VRING_CALL, + .flags = VHOST_USER_VERSION, + .size = sizeof(vmsg.payload.state), + .payload.state = { + .index = vq - dev->vq, + }, + }; + bool ack = sync && + vu_has_protocol_feature(dev, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + + if (ack) { + vmsg.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + vu_message_write(dev, dev->slave_fd, &vmsg); + if (ack) { + vu_message_read(dev, dev->slave_fd, &vmsg); + } + return; + } + if (eventfd_write(vq->call_fd, 1) < 0) { vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); } } +void vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + _vu_queue_notify(dev, vq, false); +} + +void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq) +{ + _vu_queue_notify(dev, vq, true); +} + static inline void vring_used_flags_set_bit(VuVirtq *vq, int mask) { diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 5cb7708559..6fc8000e99 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -54,6 +54,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, VHOST_USER_PROTOCOL_F_MAX }; @@ -95,6 +96,7 @@ typedef enum VhostUserRequest { VHOST_USER_GET_INFLIGHT_FD = 31, VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, VHOST_USER_MAX } VhostUserRequest; @@ -103,6 +105,8 @@ typedef enum VhostUserSlaveRequest { VHOST_USER_SLAVE_IOTLB_MSG = 1, VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_VRING_CALL = 4, + VHOST_USER_SLAVE_VRING_ERR = 5, VHOST_USER_SLAVE_MAX } VhostUserSlaveRequest; @@ -528,6 +532,16 @@ bool vu_queue_empty(VuDev *dev, VuVirtq *vq); */ void vu_queue_notify(VuDev *dev, VuVirtq *vq); +/** + * vu_queue_notify_sync: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Request to notify the queue via callfd (skipped if unnecessary) + * or sync message if possible. + */ +void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq); + /** * vu_queue_pop: * @dev: a VuDev context From fd9b0830b0be0ea89cf30ad846a232e0c442a18a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 29 Jan 2020 15:06:21 +0100 Subject: [PATCH 28/30] acpi: cpuhp: document CPHP_GET_CPU_ID_CMD command Commit 3a61c8db9d25 introduced CPHP_GET_CPU_ID_CMD command but did not sufficiently describe it. Fix it by adding missing command documentation. Fixes: 3a61c8db9d25 ("acpi: cpuhp: add CPHP_GET_CPU_ID_CMD command") Signed-off-by: Igor Mammedov Reviewed-by: Laszlo Ersek Message-Id: <1580306781-228371-1-git-send-email-imammedo@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/specs/acpi_cpu_hotplug.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/specs/acpi_cpu_hotplug.txt b/docs/specs/acpi_cpu_hotplug.txt index a8ce5e7402..9bb22d1270 100644 --- a/docs/specs/acpi_cpu_hotplug.txt +++ b/docs/specs/acpi_cpu_hotplug.txt @@ -94,6 +94,8 @@ write access: register in QEMU 2: following writes to 'Command data' register set OST status register in QEMU + 3: following reads from 'Command data' and 'Command data 2' return + architecture specific CPU ID value for currently selected CPU. other values: reserved [0x6-0x7] reserved [0x8] Command data: (DWORD access) From 67b3965e89f8466fc6a61caa2142dd2c89257383 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 21 Jan 2020 22:45:53 +0100 Subject: [PATCH 29/30] vhost-user: only set slave channel for first vq When multiqueue is enabled, a vhost_dev is created for each queue pair. However, only one slave channel is needed. Fixes: 4bbeeba023f2 (vhost-user: add slave-req-fd support) Cc: marcandre.lureau@redhat.com Signed-off-by: Adrian Moreno Message-Id: <20200121214553.28459-1-amorenoz@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 2e81f5514f..35baf4f347 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1458,9 +1458,11 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature."); } - err = vhost_setup_slave_channel(dev); - if (err < 0) { - return err; + if (dev->vq_index == 0) { + err = vhost_setup_slave_channel(dev); + if (err < 0) { + return err; + } } u->postcopy_notifier.notify = vhost_user_postcopy_notifier; From b844a4c77b618acfba6b3f4ce12d2ad709f99279 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Wed, 15 Jan 2020 21:57:04 -0500 Subject: [PATCH 30/30] Fixed assert in vhost_user_set_mem_table_postcopy The current vhost_user_set_mem_table_postcopy() implementation populates each region of the VHOST_USER_SET_MEM_TABLE message without first checking if there are more than VHOST_MEMORY_MAX_NREGIONS already populated. This can cause memory corruption if too many regions are added to the message during the postcopy step. This change moves an existing assert up such that attempting to construct a VHOST_USER_SET_MEM_TABLE message with too many memory regions will gracefully bring down qemu instead of corrupting memory. Signed-off-by: Raphael Norwitz Signed-off-by: Peter Turschmid Message-Id: <1579143426-18305-2-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 35baf4f347..08e7e63790 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -443,6 +443,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, &offset); fd = memory_region_get_fd(mr); if (fd > 0) { + assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); trace_vhost_user_set_mem_table_withfd(fd_num, mr->name, reg->memory_size, reg->guest_phys_addr, @@ -455,7 +456,6 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; msg.payload.memory.regions[fd_num].mmap_offset = offset; - assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); fds[fd_num++] = fd; } else { u->region_rb_offset[i] = 0;