Merge tag 'migration-20230206-pull-request' of https://gitlab.com/juan.quintela/qemu into staging

Migration Pull request

In this try:
- rebase to latest upstream
- same as previous patch
- fix compilation on non-Linux (userfaultfd.h) (me)
- query-migrationthreads (jiang)
- fix race on reading MultiFDPages_t.block (zhenzhong)
- fix flush of zero copy page send request (zhenzhong)

Please apply.

Previous try:
It includes:
- David Hildenbrand fixes for virtio-mem
- David Gilbert canary to detect problems
- Fix for rdma return values (Fiona)
- Peter Xu uffd_open fixes
- Peter Xu show right downtime for postcopy
- manish.mishra MSG_PEEK fixes
- my vfio changes.

Please apply.

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEGJn/jt6/WMzuA0uC9IfvGFhy1yMFAmPhobYACgkQ9IfvGFhy
# 1yMNaA/9EHDPqrI1HL/VkJG4nNOOsQR7RbburXEberZOzvLjnqpjUD3Ls9qV6rx+
# ieHa5T4imYJFk72Wa5vx4r1/dCjtJD2W6jg5+/0nTvYAHrs1U1VRqpuTr0HiXdbJ
# ZLLCnW5eDyO3eMaOX0MUkgHgL0FNkc/Lq5ViCTFsMu9O9xMuDLLdAC3cdvslKuOu
# X1gKByr9jT817Y9e36amYmRaJKC6Cr/PIekNVFu12HBW79pPusLX8KWEf4RBw4HR
# sPwTvMCR/BwZ0+2Lppan60G5rt/ZxDu40oU7y+RHlfWqevl4hDM84/nhjMvEgzc5
# a4Ahe2ERGLwwnC8z3l7v9+pEzSGzDoPcnRGvZcpUpk68wTDtxd5Bdq8CwmNUfL07
# VzWcYpH0yvmwjBba9jfn9fAVgnG5rVp558XcYLIII3wEToty3UDtm43wSdj2CGr6
# cu+IPAp+n/I5G9SRYBTU9ozJz45ttnEe0hxUtZ4I3MuhzHi1VEDAqTWM/X0LyS41
# TB3Y5B2KKpJYbPyZEH4nyTeetR2k7alTFzahCgKqVfOgL0nJx54petjS1K+B1P72
# g6lhP9WnQ33W+M8S7J/aGEaDJd1lFyFB2Rdjn2ZZnASH/fR9j0mFmXWvulXtjFNp
# Sfim3887+Iv4Uzw4VWEe3mM5Ypi/Ba2CmuTjy/pM08Ey8X1Qs5o=
# =ZQbR
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Feb 2023 00:56:22 GMT
# gpg:                using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full]
# gpg:                 aka "Juan Quintela <quintela@trasno.org>" [full]
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03  4B82 F487 EF18 5872 D723

* tag 'migration-20230206-pull-request' of https://gitlab.com/juan.quintela/qemu: (30 commits)
  migration: save/delete migration thread info
  migration: Introduce interface query-migrationthreads
  multifd: Fix flush of zero copy page send request
  multifd: Fix a race on reading MultiFDPages_t.block
  migration: check magic value for deciding the mapping of channels
  io: Add support for MSG_PEEK for socket channel
  migration/dirtyrate: Show sample pages only in page-sampling mode
  migration: Perform vmsd structure check during tests
  migration: Add canary to VMSTATE_END_OF_LIST
  migration/rdma: fix return value for qio_channel_rdma_{readv,writev}
  migration: Show downtime during postcopy phase
  virtio-mem: Proper support for preallocation with migration
  virtio-mem: Migrate immutable properties early
  virtio-mem: Fail if a memory backend with "prealloc=on" is specified
  migration/ram: Factor out check for advised postcopy
  migration/vmstate: Introduce VMSTATE_WITH_TMP_TEST() and VMSTATE_BITMAP_TEST()
  migration/savevm: Allow immutable device state to be migrated early (i.e., before RAM)
  migration/savevm: Prepare vmdesc json writer in qemu_savevm_state_setup()
  migration/savevm: Move more savevm handling into vmstate_save()
  migration/ram: Optimize ram_write_tracking_start() for RamDiscardManager
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Branch: master
Peter Maydell 2023-02-07 15:16:51 +00:00
commit b86307ecef
53 changed files with 2135 additions and 234 deletions

View File

@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
&msgfds, &msgfds_num,
NULL);
0, NULL);
} else {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
NULL, NULL,
NULL);
0, NULL);
}
if (msgfds_num) {

View File

@ -0,0 +1,7 @@
# Boards:
#
CONFIG_ISAPC=n
CONFIG_I440FX=n
CONFIG_Q35=n
CONFIG_MICROVM=y

View File

@ -0,0 +1,6 @@
# Boards:
#
CONFIG_ISAPC=y
CONFIG_I440FX=y
CONFIG_Q35=y
CONFIG_MICROVM=y

View File

@ -482,15 +482,17 @@ An iterative device must provide:
- A ``load_setup`` function that initialises the data structures on the
destination.
- A ``save_live_pending`` function that is called repeatedly and must
indicate how much more data the iterative data must save. The core
migration code will use this to determine when to pause the CPUs
and complete the migration.
- A ``state_pending_exact`` function that indicates how much more
data we must save. The core migration code will use this to
determine when to pause the CPUs and complete the migration.
- A ``save_live_iterate`` function (called after ``save_live_pending``
when there is significant data still to be sent). It should send
a chunk of data until the point that stream bandwidth limits tell it
to stop. Each call generates one section.
- A ``state_pending_estimate`` function that indicates how much more
data we must save. When the estimated amount is smaller than the
threshold, we call ``state_pending_exact``.
- A ``save_live_iterate`` function should send a chunk of data until
the point that stream bandwidth limits tell it to stop. Each call
generates one section.
- A ``save_live_complete_precopy`` function that must transmit the
last section for the device containing any remaining data.
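To make the reworked interface concrete, here is a minimal sketch (not code from this series: the "demo" device, its field and the demo_query_dirty_bytes() helper are invented for illustration) of how an iterative device might implement the split state_pending_estimate / state_pending_exact hooks described above:

#include "migration/register.h"    /* SaveVMHandlers is declared here */

typedef struct DemoDeviceState {
    uint64_t dirty_bytes;          /* cached, possibly stale amount left */
} DemoDeviceState;

static void demo_state_pending_estimate(void *opaque,
                                        uint64_t *res_precopy_only,
                                        uint64_t *res_compatible,
                                        uint64_t *res_postcopy_only)
{
    DemoDeviceState *s = opaque;

    /* Cheap path: report the cached counter without touching the device. */
    *res_precopy_only += s->dirty_bytes;
}

static void demo_state_pending_exact(void *opaque,
                                     uint64_t *res_precopy_only,
                                     uint64_t *res_compatible,
                                     uint64_t *res_postcopy_only)
{
    DemoDeviceState *s = opaque;

    /* Expensive path: query the device so the counter is precise.
     * demo_query_dirty_bytes() is a hypothetical helper. */
    s->dirty_bytes = demo_query_dirty_bytes(s);
    *res_precopy_only += s->dirty_bytes;
}

static SaveVMHandlers savevm_demo_handlers = {
    /* .save_setup, .save_live_iterate, .save_live_complete_precopy and
     * friends are registered exactly as before; only the pending hooks
     * changed with this series. */
    .state_pending_estimate = demo_state_pending_estimate,
    .state_pending_exact    = demo_state_pending_exact,
};

The core migration code calls the cheap estimate on every iteration and only falls back to the exact variant once the estimated amount drops below the threshold, as the reworked migration_iteration_run() later in this page shows.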

View File

@ -28,7 +28,7 @@ VFIO implements the device hooks for the iterative approach as follows:
* A ``load_setup`` function that sets up the migration region on the
destination and sets _RESUMING flag in the VFIO device state.
* A ``save_live_pending`` function that reads pending_bytes from the vendor
* A ``state_pending_exact`` function that reads pending_bytes from the vendor
driver, which indicates the amount of data that the vendor driver has yet to
save for the VFIO device.
@ -114,7 +114,7 @@ Live migration save path
(RUNNING, _SETUP, _RUNNING|_SAVING)
|
(RUNNING, _ACTIVE, _RUNNING|_SAVING)
If device is active, get pending_bytes by .save_live_pending()
If device is active, get pending_bytes by .state_pending_exact()
If total pending_bytes >= threshold_size, call .save_live_iterate()
Data of VFIO device for pre-copy phase is copied
Iterate till total pending bytes converge and are less than threshold

View File

@ -41,7 +41,9 @@
#include "hw/virtio/virtio-pci.h"
#include "qom/object_interfaces.h"
GlobalProperty hw_compat_7_2[] = {};
GlobalProperty hw_compat_7_2[] = {
{ "virtio-mem", "x-early-migration", "false" },
};
const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
GlobalProperty hw_compat_7_1[] = {

View File

@ -182,10 +182,10 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
return 0;
}
static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void cmma_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
S390StAttribState *sas = S390_STATTRIB(opaque);
S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
@ -371,7 +371,8 @@ static SaveVMHandlers savevm_s390_stattrib_handlers = {
.save_setup = cmma_save_setup,
.save_live_iterate = cmma_save_iterate,
.save_live_complete_precopy = cmma_save_complete,
.save_live_pending = cmma_save_pending,
.state_pending_exact = cmma_state_pending,
.state_pending_estimate = cmma_state_pending,
.save_cleanup = cmma_save_cleanup,
.load_state = cmma_load,
.is_active = cmma_active,

View File

@ -456,11 +456,10 @@ static void vfio_save_cleanup(void *opaque)
trace_vfio_save_cleanup(vbasedev->name);
}
static void vfio_save_pending(QEMUFile *f, void *opaque,
uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void vfio_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
@ -473,7 +472,7 @@ static void vfio_save_pending(QEMUFile *f, void *opaque,
*res_precopy_only += migration->pending_bytes;
trace_vfio_save_pending(vbasedev->name, *res_precopy_only,
trace_vfio_state_pending(vbasedev->name, *res_precopy_only,
*res_postcopy_only, *res_compatible);
}
@ -515,9 +514,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
}
/*
* Reset pending_bytes as .save_live_pending is not called during savevm or
* snapshot case, in such case vfio_update_pending() at the start of this
* function updates pending_bytes.
* Reset pending_bytes as state_pending* are not called during
* savevm or snapshot case, in such case vfio_update_pending() at
* the start of this function updates pending_bytes.
*/
migration->pending_bytes = 0;
trace_vfio_save_iterate(vbasedev->name, data_size);
@ -685,7 +684,8 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
static SaveVMHandlers savevm_vfio_handlers = {
.save_setup = vfio_save_setup,
.save_cleanup = vfio_save_cleanup,
.save_live_pending = vfio_save_pending,
.state_pending_exact = vfio_state_pending,
.state_pending_estimate = vfio_state_pending,
.save_live_iterate = vfio_save_iterate,
.save_live_complete_precopy = vfio_save_complete_precopy,
.save_state = vfio_save_state,

View File

@ -157,7 +157,7 @@ vfio_save_cleanup(const char *name) " (%s)"
vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64
vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64
vfio_save_device_config_state(const char *name) " (%s)"
vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
vfio_save_complete_precopy(const char *name) " (%s)"
vfio_load_device_config_state(const char *name) " (%s)"

View File

@ -31,6 +31,8 @@
#include CONFIG_DEVICES
#include "trace.h"
static const VMStateDescription vmstate_virtio_mem_device_early;
/*
* We only had legacy x86 guests that did not support
* VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
@ -202,6 +204,30 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
return ret;
}
static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_bit, last_bit;
uint64_t offset, size;
int ret = 0;
first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
while (first_bit < vmem->bitmap_size) {
offset = first_bit * vmem->block_size;
last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
first_bit + 1) - 1;
size = (last_bit - first_bit + 1) * vmem->block_size;
ret = cb(vmem, arg, offset, size);
if (ret) {
break;
}
first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
last_bit + 2);
}
return ret;
}
/*
* Adjust the memory section to cover the intersection with the given range.
*
@ -772,6 +798,12 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
error_setg(errp, "'%s' property specifies an unsupported memdev",
VIRTIO_MEM_MEMDEV_PROP);
return;
} else if (vmem->memdev->prealloc) {
error_setg(errp, "'%s' property specifies a memdev with preallocation"
" enabled: %s. Instead, specify 'prealloc=on' for the"
" virtio-mem device. ", VIRTIO_MEM_MEMDEV_PROP,
object_get_canonical_path_component(OBJECT(vmem->memdev)));
return;
}
if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
@ -872,6 +904,10 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
if (vmem->early_migration) {
vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY,
&vmstate_virtio_mem_device_early, vmem);
}
qemu_register_reset(virtio_mem_system_reset, vmem);
/*
@ -893,6 +929,10 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
*/
memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
qemu_unregister_reset(virtio_mem_system_reset, vmem);
if (vmem->early_migration) {
vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
vmem);
}
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
host_memory_backend_set_mapped(vmem->memdev, false);
virtio_del_queue(vdev, 0);
@ -922,6 +962,10 @@ static int virtio_mem_post_load(void *opaque, int version_id)
RamDiscardListener *rdl;
int ret;
if (vmem->prealloc && !vmem->early_migration) {
warn_report("Proper preallocation with migration requires a newer QEMU machine");
}
/*
* We started out with all memory discarded and our memory region is mapped
* into an address space. Replay, now that we updated the bitmap.
@ -941,6 +985,64 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return virtio_mem_restore_unplugged(vmem);
}
static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
if (local_err) {
error_report_err(local_err);
return -ENOMEM;
}
return 0;
}
static int virtio_mem_post_load_early(void *opaque, int version_id)
{
VirtIOMEM *vmem = VIRTIO_MEM(opaque);
RAMBlock *rb = vmem->memdev->mr.ram_block;
int ret;
if (!vmem->prealloc) {
return 0;
}
/*
* We restored the bitmap and verified that the basic properties
* match on source and destination, so we can go ahead and preallocate
* memory for all plugged memory blocks, before actual RAM migration starts
* touching this memory.
*/
ret = virtio_mem_for_each_plugged_range(vmem, NULL,
virtio_mem_prealloc_range_cb);
if (ret) {
return ret;
}
/*
* This is tricky: postcopy wants to start with a clean slate. On
* POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
* preallocated) RAM such that postcopy will work as expected later.
*
* However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
* RAM migration. So let's discard all memory again. This looks like an
* expensive NOP, but actually serves a purpose: we made sure that we
* were able to allocate all required backend memory once. We cannot
* guarantee that the backend memory we will free will remain free
* until we need it during postcopy, but at least we can catch the
* obvious setup issues this way.
*/
if (migration_incoming_postcopy_advised()) {
if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
return -EBUSY;
}
}
return 0;
}
typedef struct VirtIOMEMMigSanityChecks {
VirtIOMEM *parent;
uint64_t addr;
@ -1009,18 +1111,54 @@ static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
},
};
static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
{
const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
/* With early migration, these fields were already migrated. */
return !vmem->early_migration;
}
static const VMStateDescription vmstate_virtio_mem_device = {
.name = "virtio-mem-device",
.minimum_version_id = 1,
.version_id = 1,
.priority = MIG_PRI_VIRTIO_MEM,
.post_load = virtio_mem_post_load,
.fields = (VMStateField[]) {
VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
0, bitmap_size),
VMSTATE_END_OF_LIST()
},
};
/*
* Transfer properties that are immutable while migration is active early,
* such that we have this information around before migrating any RAM
* content.
*
* Note that virtio_mem_is_busy() makes sure these properties can no longer
* change on the migration source until migration completed.
*
* With QEMU compat machines, we transmit these properties later, via
* vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
*/
static const VMStateDescription vmstate_virtio_mem_device_early = {
.name = "virtio-mem-device-early",
.minimum_version_id = 1,
.version_id = 1,
.early_setup = true,
.post_load = virtio_mem_post_load_early,
.fields = (VMStateField[]) {
VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64(size, VirtIOMEM),
VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
VMSTATE_END_OF_LIST()
},
@ -1205,6 +1343,8 @@ static Property virtio_mem_properties[] = {
DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
unplugged_inaccessible, ON_OFF_AUTO_AUTO),
#endif
DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
early_migration, true),
DEFINE_PROP_END_OF_LIST(),
};

View File

@ -31,6 +31,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,
#define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size"
#define VIRTIO_MEM_ADDR_PROP "memaddr"
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
struct VirtIOMEM {
@ -74,6 +75,13 @@ struct VirtIOMEM {
/* whether to prealloc memory when plugging new blocks */
bool prealloc;
/*
* Whether we migrate properties that are immutable while migration is
* active early, before state of other devices and especially, before
* migrating any RAM content.
*/
bool early_migration;
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;

View File

@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
typedef enum QIOChannelFeature QIOChannelFeature;
enum QIOChannelFeature {
@ -41,6 +43,7 @@ enum QIOChannelFeature {
QIO_CHANNEL_FEATURE_SHUTDOWN,
QIO_CHANNEL_FEATURE_LISTEN,
QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
};
@ -114,6 +117,7 @@ struct QIOChannelClass {
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp);
int (*io_close)(QIOChannel *ioc,
Error **errp);
@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
* @niov: the length of the @iov array
* @fds: pointer to an array that will received file handles
* @nfds: pointer filled with number of elements in @fds on return
* @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
* @errp: pointer to a NULL-initialized error object
*
* Read data from the IO channel, storing it in the
@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp);
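As a hedged illustration of the new read flag (not code from this series; the four-byte buffer and surrounding error handling are arbitrary), a caller could peek at the first bytes queued on a socket channel without consuming them, which is what the migration code below does to identify channel types:

uint32_t magic = 0;
struct iovec iov = { .iov_base = &magic, .iov_len = sizeof(magic) };
Error *local_err = NULL;

/* Only channels that advertise the feature accept the peek flag. */
if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
    ssize_t len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
                                         QIO_CHANNEL_READ_FLAG_MSG_PEEK,
                                         &local_err);
    if (len == sizeof(magic)) {
        /* magic now holds the first four bytes; they are still queued in
         * the channel, so a later ordinary read returns them again. */
    }
}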

View File

@ -67,8 +67,10 @@ bool migration_has_failed(MigrationState *);
/* ...and after the device transmission */
bool migration_in_postcopy_after_devices(MigrationState *);
void migration_global_dump(Monitor *mon);
/* True if incomming migration entered POSTCOPY_INCOMING_DISCARD */
/* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */
bool migration_in_incoming_postcopy(void);
/* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */
bool migration_incoming_postcopy_advised(void);
/* True if background snapshot is active */
bool migration_in_bg_snapshot(void);

View File

@ -46,11 +46,6 @@ typedef struct SaveVMHandlers {
/* This runs outside the iothread lock! */
int (*save_setup)(QEMUFile *f, void *opaque);
void (*save_live_pending)(QEMUFile *f, void *opaque,
uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
/* Note for save_live_pending:
* - res_precopy_only is for data which must be migrated in precopy phase
* or in stopped state, in other words - before target vm start
@ -61,8 +56,16 @@ typedef struct SaveVMHandlers {
* Sum of res_postcopy_only, res_compatible and res_postcopy_only is the
* whole amount of pending data.
*/
/* This estimates the remaining data to transfer */
void (*state_pending_estimate)(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
/* This calculates the exact remaining data to transfer */
void (*state_pending_exact)(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
LoadStateHandler *load_state;
int (*load_setup)(QEMUFile *f, void *opaque);
int (*load_cleanup)(void *opaque);

View File

@ -147,6 +147,9 @@ enum VMStateFlags {
* VMStateField.struct_version_id to tell which version of the
* structure we are referencing to use. */
VMS_VSTRUCT = 0x8000,
/* Marker for end of list */
VMS_END = 0x10000
};
typedef enum {
@ -178,7 +181,21 @@ struct VMStateField {
struct VMStateDescription {
const char *name;
int unmigratable;
bool unmigratable;
/*
* This VMSD describes something that should be sent during setup phase
* of migration. It plays similar role as save_setup() for explicitly
* registered vmstate entries, so it can be seen as a way to describe
* save_setup() in VMSD structures.
*
* Note that for now, a SaveStateEntry cannot have a VMSD and
* operations (e.g., save_setup()) set at the same time. Consequently,
* save_setup() and a VMSD with early_setup set to true are mutually
* exclusive. For this reason, also early_setup VMSDs are migrated in a
* QEMU_VM_SECTION_FULL section, while save_setup() data is migrated in
* a QEMU_VM_SECTION_START section.
*/
bool early_setup;
int version_id;
int minimum_version_id;
MigrationPriority priority;
@ -705,8 +722,9 @@ extern const VMStateInfo vmstate_info_qlist;
* '_state' type
* That the pointer is right at the start of _tmp_type.
*/
#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \
#define VMSTATE_WITH_TMP_TEST(_state, _test, _tmp_type, _vmsd) { \
.name = "tmp", \
.field_exists = (_test), \
.size = sizeof(_tmp_type) + \
QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \
type_check_pointer(_state, \
@ -715,6 +733,9 @@ extern const VMStateInfo vmstate_info_qlist;
.info = &vmstate_info_tmp, \
}
#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) \
VMSTATE_WITH_TMP_TEST(_state, NULL, _tmp_type, _vmsd)
#define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \
.name = "unused", \
.field_exists = (_test), \
@ -738,8 +759,9 @@ extern const VMStateInfo vmstate_info_qlist;
/* _field_size should be a int32_t field in the _state struct giving the
* size of the bitmap _field in bits.
*/
#define VMSTATE_BITMAP(_field, _state, _version, _field_size) { \
#define VMSTATE_BITMAP_TEST(_field, _state, _test, _version, _field_size) { \
.name = (stringify(_field)), \
.field_exists = (_test), \
.version_id = (_version), \
.size_offset = vmstate_offset_value(_state, _field_size, int32_t),\
.info = &vmstate_info_bitmap, \
@ -747,6 +769,9 @@ extern const VMStateInfo vmstate_info_qlist;
.offset = offsetof(_state, _field), \
}
#define VMSTATE_BITMAP(_field, _state, _version, _field_size) \
VMSTATE_BITMAP_TEST(_field, _state, NULL, _version, _field_size)
/* For migrating a QTAILQ.
* Target QTAILQ needs be properly initialized.
* _type: type of QTAILQ element
@ -1161,7 +1186,9 @@ extern const VMStateInfo vmstate_info_qlist;
VMSTATE_UNUSED_BUFFER(_test, 0, _size)
#define VMSTATE_END_OF_LIST() \
{}
{ \
.flags = VMS_END, \
}
int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, int version_id);
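For context, a minimal sketch of a device-side VMStateDescription under the new scheme (the "demo-device" name and its single field are invented for illustration, not part of this series): VMSTATE_END_OF_LIST() now expands to a field whose flags are VMS_END, which the vmstate_check() pass added later in this series (run when qtest is enabled) uses to verify that every field list is properly terminated.

typedef struct DemoDevice {
    uint32_t level;
} DemoDevice;

static const VMStateDescription vmstate_demo_device = {
    .name = "demo-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(level, DemoDevice),
        VMSTATE_END_OF_LIST()     /* now carries .flags = VMS_END */
    },
};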

View File

@ -13,10 +13,20 @@
#ifndef USERFAULTFD_H
#define USERFAULTFD_H
#ifdef CONFIG_LINUX
#include "qemu/osdep.h"
#include "exec/hwaddr.h"
#include <linux/userfaultfd.h>
/**
* uffd_open(): Open an userfaultfd handle for current context.
*
* @flags: The flags we want to pass in when creating the handle.
*
* Returns: the uffd handle if >=0, or <0 if error happens.
*/
int uffd_open(int flags);
int uffd_query_features(uint64_t *features);
int uffd_create_fd(uint64_t features, bool non_blocking);
void uffd_close_fd(int uffd_fd);
@ -32,4 +42,6 @@ int uffd_wakeup(int uffd_fd, void *addr, uint64_t length);
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count);
bool uffd_poll_events(int uffd_fd, int tmo);
#endif /* CONFIG_LINUX */
#endif /* USERFAULTFD_H */

View File

@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);

View File

@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);

View File

@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);

View File

@ -60,6 +60,7 @@ qio_channel_null_readv(QIOChannel *ioc,
size_t niov,
int **fds G_GNUC_UNUSED,
size_t *nfds G_GNUC_UNUSED,
int flags,
Error **errp)
{
QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);

View File

@ -173,6 +173,9 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
}
#endif
qio_channel_set_feature(QIO_CHANNEL(ioc),
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
return 0;
}
@ -406,6 +409,9 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
}
#endif /* WIN32 */
qio_channel_set_feature(QIO_CHANNEL(cioc),
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd);
return cioc;
@ -496,6 +502,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@ -517,6 +524,10 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
}
if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
sflags |= MSG_PEEK;
}
retry:
ret = recvmsg(sioc->fd, &msg, sflags);
if (ret < 0) {
@ -624,11 +635,17 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
ssize_t done = 0;
ssize_t i;
int sflags = 0;
if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
sflags |= MSG_PEEK;
}
for (i = 0; i < niov; i++) {
ssize_t ret;
@ -636,7 +653,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
ret = recv(sioc->fd,
iov[i].iov_base,
iov[i].iov_len,
0);
sflags);
if (ret < 0) {
if (errno == EAGAIN) {
if (done) {

View File

@ -260,6 +260,7 @@ static ssize_t qio_channel_tls_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);

View File

@ -1081,6 +1081,7 @@ static ssize_t qio_channel_websock_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);

View File

@ -52,6 +52,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
@ -63,7 +64,14 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
return -1;
}
return klass->io_readv(ioc, iov, niov, fds, nfds, errp);
if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) &&
!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
error_setg_errno(errp, EINVAL,
"Channel does not support peek read");
return -1;
}
return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp);
}
@ -146,7 +154,7 @@ int qio_channel_readv_full_all_eof(QIOChannel *ioc,
while ((nlocal_iov > 0) || local_fds) {
ssize_t len;
len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
local_nfds, errp);
local_nfds, 0, errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(ioc, G_IO_IN);
@ -284,7 +292,7 @@ ssize_t qio_channel_readv(QIOChannel *ioc,
size_t niov,
Error **errp)
{
return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp);
return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp);
}
@ -303,7 +311,7 @@ ssize_t qio_channel_read(QIOChannel *ioc,
Error **errp)
{
struct iovec iov = { .iov_base = buf, .iov_len = buflen };
return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp);
return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp);
}

View File

@ -762,11 +762,10 @@ static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void dirty_bitmap_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
DBMSaveState *s = &((DBMState *)opaque)->save;
SaveBitmapState *dbms;
@ -784,7 +783,7 @@ static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
qemu_mutex_unlock_iothread();
trace_dirty_bitmap_save_pending(pending, max_size);
trace_dirty_bitmap_state_pending(pending);
*res_postcopy_only += pending;
}
@ -1253,7 +1252,8 @@ static SaveVMHandlers savevm_dirty_bitmap_handlers = {
.save_live_complete_postcopy = dirty_bitmap_save_complete,
.save_live_complete_precopy = dirty_bitmap_save_complete,
.has_postcopy = dirty_bitmap_has_postcopy,
.save_live_pending = dirty_bitmap_save_pending,
.state_pending_exact = dirty_bitmap_state_pending,
.state_pending_estimate = dirty_bitmap_state_pending,
.save_live_iterate = dirty_bitmap_save_iterate,
.is_active_iterate = dirty_bitmap_is_active_iterate,
.load_state = dirty_bitmap_load,

View File

@ -863,10 +863,10 @@ static int block_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void block_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
/* Estimate pending number of bytes to send */
uint64_t pending;
@ -885,7 +885,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
pending = BLK_MIG_BLOCK_SIZE;
}
trace_migration_block_save_pending(pending);
trace_migration_block_state_pending(pending);
/* We don't do postcopy */
*res_precopy_only += pending;
}
@ -1020,7 +1020,8 @@ static SaveVMHandlers savevm_block_handlers = {
.save_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete_precopy = block_save_complete,
.save_live_pending = block_save_pending,
.state_pending_exact = block_state_pending,
.state_pending_estimate = block_state_pending,
.load_state = block_load,
.save_cleanup = block_migration_cleanup,
.is_active = block_is_active,

View File

@ -53,6 +53,7 @@ qio_channel_block_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);

View File

@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s,
migrate_fd_connect(s, error);
error_free(error);
}
/**
* @migration_channel_read_peek - Peek at migration channel, without
* actually removing it from channel buffer.
*
* @ioc: the channel object
* @buf: the memory region to read data into
* @buflen: the number of bytes to read in @buf
* @errp: pointer to a NULL-initialized error object
*
* Returns 0 if successful, returns -1 and sets @errp if fails.
*/
int migration_channel_read_peek(QIOChannel *ioc,
const char *buf,
const size_t buflen,
Error **errp)
{
ssize_t len = 0;
struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
while (true) {
len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
error_setg(errp,
"Failed to peek at channel");
return -1;
}
if (len == buflen) {
break;
}
/* 1ms sleep. */
if (qemu_in_coroutine()) {
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
} else {
g_usleep(1000);
}
}
return 0;
}

View File

@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error_in);
int migration_channel_read_peek(QIOChannel *ioc,
const char *buf,
const size_t buflen,
Error **errp);
#endif

View File

@ -714,8 +714,8 @@ void qmp_calc_dirty_rate(int64_t calc_time,
mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
}
if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
error_setg(errp, "either sample-pages or dirty-ring can be specified.");
if (has_sample_pages && mode != DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
error_setg(errp, "sample-pages is used only in page-sampling mode");
return;
}
@ -785,8 +785,10 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict)
DirtyRateStatus_str(info->status));
monitor_printf(mon, "Start Time: %"PRIi64" (ms)\n",
info->start_time);
monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
info->sample_pages);
if (info->mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
info->sample_pages);
}
monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
info->calc_time);
monitor_printf(mon, "Mode: %s\n",

View File

@ -26,6 +26,7 @@ softmmu_ss.add(files(
'savevm.c',
'socket.c',
'tls.c',
'threadinfo.c',
), gnutls)
softmmu_ss.add(when: rdma, if_true: files('rdma.c'))

View File

@ -31,6 +31,7 @@
#include "migration.h"
#include "savevm.h"
#include "qemu-file.h"
#include "channel.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
@ -57,6 +58,7 @@
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"
@ -664,10 +666,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
if (multifd_load_setup(errp) != 0) {
return false;
}
if (!mis->from_src_file) {
mis->from_src_file = f;
}
@ -734,31 +732,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
bool start_migration;
QEMUFile *f;
bool default_channel = true;
uint32_t channel_magic = 0;
int ret = 0;
if (!mis->from_src_file) {
/* The first connection (multifd may have multiple) */
if (migrate_use_multifd() && !migrate_postcopy_ram() &&
qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
/*
* With multiple channels, it is possible that we receive channels
* out of order on destination side, causing incorrect mapping of
* source channels on destination side. Check channel MAGIC to
* decide type of channel. Please note this is best effort, postcopy
* preempt channel does not send any magic number so avoid it for
* postcopy live migration. Also tls live migration already does
* tls handshake while initializing main channel so with tls this
* issue is not possible.
*/
ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
sizeof(channel_magic), &local_err);
if (ret != 0) {
error_propagate(errp, local_err);
return;
}
default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
} else {
default_channel = !mis->from_src_file;
}
if (multifd_load_setup(errp) != 0) {
error_setg(errp, "Failed to setup multifd channels");
return;
}
if (default_channel) {
f = qemu_file_new_input(ioc);
if (!migration_incoming_setup(f, errp)) {
return;
}
/*
* Common migration only needs one channel, so we can start
* right now. Some features need more than one channel, we wait.
*/
start_migration = !migration_needs_multiple_sockets();
} else {
/* Multiple connections */
assert(migration_needs_multiple_sockets());
if (migrate_use_multifd()) {
start_migration = multifd_recv_new_channel(ioc, &local_err);
multifd_recv_new_channel(ioc, &local_err);
} else {
assert(migrate_postcopy_preempt());
f = qemu_file_new_input(ioc);
start_migration = postcopy_preempt_new_channel(mis, f);
postcopy_preempt_new_channel(mis, f);
}
if (local_err) {
error_propagate(errp, local_err);
@ -766,7 +789,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
}
}
if (start_migration) {
if (migration_has_all_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@ -1051,20 +1074,30 @@ bool migration_is_running(int state)
}
}
static bool migrate_show_downtime(MigrationState *s)
{
return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
}
static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
info->has_status = true;
info->has_setup_time = true;
info->setup_time = s->setup_time;
if (s->state == MIGRATION_STATUS_COMPLETED) {
info->has_total_time = true;
info->total_time = s->total_time;
info->has_downtime = true;
info->downtime = s->downtime;
} else {
info->has_total_time = true;
info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
s->start_time;
}
if (migrate_show_downtime(s)) {
info->has_downtime = true;
info->downtime = s->downtime;
} else {
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
}
@ -1933,6 +1966,8 @@ static void migrate_fd_cleanup(MigrationState *s)
g_free(s->hostname);
s->hostname = NULL;
json_writer_free(s->vmdesc);
s->vmdesc = NULL;
qemu_savevm_state_cleanup();
@ -2124,6 +2159,13 @@ bool migration_in_incoming_postcopy(void)
return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
}
bool migration_incoming_postcopy_advised(void)
{
PostcopyState ps = postcopy_state_get();
return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}
bool migration_in_bg_snapshot(void)
{
MigrationState *s = migrate_get_current();
@ -3778,33 +3820,39 @@ typedef enum {
*/
static MigIterateState migration_iteration_run(MigrationState *s)
{
uint64_t pending_size, pend_pre, pend_compat, pend_post;
uint64_t pend_pre, pend_compat, pend_post;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
&pend_compat, &pend_post);
pending_size = pend_pre + pend_compat + pend_post;
qemu_savevm_state_pending_estimate(&pend_pre, &pend_compat, &pend_post);
uint64_t pending_size = pend_pre + pend_compat + pend_post;
trace_migrate_pending(pending_size, s->threshold_size,
pend_pre, pend_compat, pend_post);
trace_migrate_pending_estimate(pending_size,
pend_pre, pend_compat, pend_post);
if (pending_size && pending_size >= s->threshold_size) {
/* Still a significant amount to transfer */
if (!in_postcopy && pend_pre <= s->threshold_size &&
qatomic_read(&s->start_postcopy)) {
if (postcopy_start(s)) {
error_report("%s: postcopy failed to start", __func__);
}
return MIG_ITERATE_SKIP;
}
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
} else {
if (pend_pre + pend_compat <= s->threshold_size) {
qemu_savevm_state_pending_exact(&pend_pre, &pend_compat, &pend_post);
pending_size = pend_pre + pend_compat + pend_post;
trace_migrate_pending_exact(pending_size,
pend_pre, pend_compat, pend_post);
}
if (!pending_size || pending_size < s->threshold_size) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
/* Still a significant amount to transfer */
if (!in_postcopy && pend_pre <= s->threshold_size &&
qatomic_read(&s->start_postcopy)) {
if (postcopy_start(s)) {
error_report("%s: postcopy failed to start", __func__);
}
return MIG_ITERATE_SKIP;
}
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
return MIG_ITERATE_RESUME;
}
@ -3981,10 +4029,13 @@ static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
MigrationThread *thread = NULL;
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
MigThrError thr_error;
bool urgent = false;
thread = MigrationThreadAdd("live_migration", qemu_get_thread_id());
rcu_register_thread();
object_ref(OBJECT(s));
@ -4061,6 +4112,7 @@ static void *migration_thread(void *opaque)
migration_iteration_finish(s);
object_unref(OBJECT(s));
rcu_unregister_thread();
MigrationThreadDel(thread);
return NULL;
}

View File

@ -17,6 +17,7 @@
#include "exec/cpu-common.h"
#include "hw/qdev-core.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
@ -366,6 +367,9 @@ struct MigrationState {
* This save hostname when out-going migration starts
*/
char *hostname;
/* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
JSONWriter *vmdesc;
};
void migrate_set_state(int *state, int old_state, int new_state);

View File

@ -24,6 +24,7 @@
#include "qemu-file.h"
#include "trace.h"
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
#include "io/channel-socket.h"
@ -442,6 +443,7 @@ static int multifd_send_pages(QEMUFile *f)
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
MultiFDPages_t *pages = multifd_send_state->pages;
bool changed = false;
if (!pages->block) {
pages->block = block;
@ -454,14 +456,16 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
if (pages->num < pages->allocated) {
return 1;
}
} else {
changed = true;
}
if (multifd_send_pages(f) < 0) {
return -1;
}
if (pages->block != block) {
return multifd_queue_page(f, block, offset);
if (changed) {
return multifd_queue_page(f, block, offset);
}
return 1;
@ -627,16 +631,16 @@ int multifd_send_sync_main(QEMUFile *f)
stat64_add(&ram_atomic_counters.transferred, p->packet_len);
qemu_mutex_unlock(&p->mutex);
qemu_sem_post(&p->sem);
if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
return -1;
}
}
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
trace_multifd_send_sync_main_wait(p->id);
qemu_sem_wait(&p->sem_sync);
if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
return -1;
}
}
trace_multifd_send_sync_main(multifd_send_state->packet_num);
@ -646,10 +650,13 @@ int multifd_send_sync_main(QEMUFile *f)
static void *multifd_send_thread(void *opaque)
{
MultiFDSendParams *p = opaque;
MigrationThread *thread = NULL;
Error *local_err = NULL;
int ret = 0;
bool use_zero_copy_send = migrate_use_zero_copy_send();
thread = MigrationThreadAdd(p->name, qemu_get_thread_id());
trace_multifd_send_thread_start(p->id);
rcu_register_thread();
@ -759,6 +766,7 @@ out:
qemu_mutex_unlock(&p->mutex);
rcu_unregister_thread();
MigrationThreadDel(thread);
trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages);
return NULL;
@ -1164,9 +1172,14 @@ int multifd_load_setup(Error **errp)
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
uint8_t i;
if (!migrate_use_multifd()) {
/*
* Return successfully if multiFD recv state is already initialised
* or multiFD is not enabled.
*/
if (multifd_recv_state || !migrate_use_multifd()) {
return 0;
}
if (!migrate_multi_channels_is_allowed()) {
error_setg(errp, "multifd is not supported by current protocol");
return -1;
@ -1227,11 +1240,9 @@ bool multifd_recv_all_channels_created(void)
/*
* Try to receive all multifd channels to get ready for the migration.
* - Return true and do not set @errp when correctly receiving all channels;
* - Return false and do not set @errp when correctly receiving the current one;
* - Return false and set @errp when failing to receive the current channel.
* Sets @errp when failing to receive the current channel.
*/
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
MultiFDRecvParams *p;
Error *local_err = NULL;
@ -1244,7 +1255,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
"failed to receive packet"
" via multifd channel %d: ",
qatomic_read(&multifd_recv_state->count));
return false;
return;
}
trace_multifd_recv_new_channel(id);
@ -1254,7 +1265,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
id);
multifd_recv_terminate_threads(local_err);
error_propagate(errp, local_err);
return false;
return;
}
p->c = ioc;
object_ref(OBJECT(ioc));
@ -1265,6 +1276,4 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
QEMU_THREAD_JOINABLE);
qatomic_inc(&multifd_recv_state->count);
return qatomic_read(&multifd_recv_state->count) ==
migrate_multifd_channels();
}

migration/multifd.c.orig (new file, 1274 lines added): diff suppressed because it is too large

View File

@ -18,7 +18,7 @@ void multifd_save_cleanup(void);
int multifd_load_setup(Error **errp);
int multifd_load_cleanup(Error **errp);
bool multifd_recv_all_channels_created(void);
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_sync_main(void);
int multifd_send_sync_main(QEMUFile *f);
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);

View File

@ -37,6 +37,7 @@
#include "qemu-file.h"
#include "yank_functions.h"
#include "tls.h"
#include "qemu/userfaultfd.h"
/* Arbitrary limit on size of each discard command,
* keeps them around ~200 bytes
@ -226,11 +227,9 @@ static bool receive_ufd_features(uint64_t *features)
int ufd;
bool ret = true;
/* if we are here __NR_userfaultfd should exists */
ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
strerror(errno));
error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
return false;
}
@ -375,7 +374,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
goto out;
}
ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
error_report("%s: userfaultfd not available: %s", __func__,
strerror(errno));
@ -1160,7 +1159,7 @@ static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
/* Open the fd for the kernel to give us userfaults */
mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
if (mis->userfault_fd == -1) {
error_report("%s: Failed to open userfault fd: %s", __func__,
strerror(errno));
@ -1539,7 +1538,7 @@ void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
}
}
bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
{
/*
* The new loading channel has its own threads, so it needs to be
@ -1548,9 +1547,6 @@ bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
qemu_file_set_blocking(file, true);
mis->postcopy_qemufile_dst = file;
trace_postcopy_preempt_new_channel();
/* Start the migration immediately */
return true;
}
/*

View File

@ -190,7 +190,7 @@ enum PostcopyChannels {
RAM_CHANNEL_MAX,
};
bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
int postcopy_preempt_setup(MigrationState *s, Error **errp);
int postcopy_preempt_wait_channel(MigrationState *s);

View File

@ -1774,13 +1774,15 @@ out:
static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
ram_addr_t size)
{
const ram_addr_t end = offset + size;
/*
* We read one byte of each page; this will preallocate page tables if
* required and populate the shared zeropage on MAP_PRIVATE anonymous memory
* where no page was populated yet. This might require adaption when
* supporting other mappings, like shmem.
*/
for (; offset < size; offset += block->page_size) {
for (; offset < end; offset += block->page_size) {
char tmp = *((char *)block->host + offset);
/* Don't optimize the read out */
@ -1863,6 +1865,39 @@ void ram_write_tracking_prepare(void)
}
}
static inline int uffd_protect_section(MemoryRegionSection *section,
void *opaque)
{
const hwaddr size = int128_get64(section->size);
const hwaddr offset = section->offset_within_region;
RAMBlock *rb = section->mr->ram_block;
int uffd_fd = (uintptr_t)opaque;
return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
false);
}
static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
assert(rb->flags & RAM_UF_WRITEPROTECT);
/* See ram_block_populate_read() */
if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
MemoryRegionSection section = {
.mr = rb->mr,
.offset_within_region = 0,
.size = rb->mr->size,
};
return ram_discard_manager_replay_populated(rdm, &section,
uffd_protect_section,
(void *)(uintptr_t)uffd_fd);
}
return uffd_change_protection(uffd_fd, rb->host,
rb->used_length, true, false);
}
/*
* ram_write_tracking_start: start UFFD-WP memory tracking
*
@ -1894,14 +1929,14 @@ int ram_write_tracking_start(void)
block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
goto fail;
}
/* Apply UFFD write protection to the block memory range */
if (uffd_change_protection(rs->uffdio_fd, block->host,
block->max_length, true, false)) {
goto fail;
}
block->flags |= RAM_UF_WRITEPROTECT;
memory_region_ref(block->mr);
/* Apply UFFD write protection to the block memory range */
if (ram_block_uffd_protect(block, uffd_fd)) {
goto fail;
}
trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
block->host, block->max_length);
}
@ -1915,12 +1950,6 @@ fail:
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
/*
* In case some memory block failed to be write-protected
* remove protection and unregister all succeeded RAM blocks
*/
uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
/* Cleanup flags and remove reference */
block->flags &= ~RAM_UF_WRITEPROTECT;
@ -1946,9 +1975,6 @@ void ram_write_tracking_stop(void)
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
/* Remove protection and unregister all affected RAM blocks */
uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
@ -2319,8 +2345,25 @@ static void pss_host_page_prepare(PageSearchStatus *pss)
size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
pss->host_page_sending = true;
pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
if (guest_pfns <= 1) {
/*
* This covers both when guest psize == host psize, or when guest
* has larger psize than the host (guest_pfns==0).
*
* For the latter, we always send one whole guest page per
* iteration of the host page (example: an Alpha VM on x86 host
* will have guest psize 8K while host psize 4K).
*/
pss->host_page_start = pss->page;
pss->host_page_end = pss->page + 1;
} else {
/*
* The host page spans over multiple guest pages, we send them
* within the same host page iteration.
*/
pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
}
}
/*
@ -3392,19 +3435,35 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void ram_state_pending_estimate(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
uint64_t remaining_size;
remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
if (!migration_in_postcopy() &&
remaining_size < max_size) {
if (migrate_postcopy_ram()) {
/* We can do postcopy, and all the data is postcopiable */
*res_postcopy_only += remaining_size;
} else {
*res_precopy_only += remaining_size;
}
}
static void ram_state_pending_exact(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
if (!migration_in_postcopy()) {
qemu_mutex_lock_iothread();
WITH_RCU_READ_LOCK_GUARD() {
migration_bitmap_sync_precopy(rs);
@ -4091,12 +4150,6 @@ int ram_load_postcopy(QEMUFile *f, int channel)
return ret;
}
static bool postcopy_is_advised(void)
{
PostcopyState ps = postcopy_state_get();
return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}
static bool postcopy_is_running(void)
{
PostcopyState ps = postcopy_state_get();
@ -4167,7 +4220,7 @@ static int ram_load_precopy(QEMUFile *f)
MigrationIncomingState *mis = migration_incoming_get_current();
int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
/* ADVISE is earlier, it shows the source has the postcopy capability on */
bool postcopy_advised = postcopy_is_advised();
bool postcopy_advised = migration_incoming_postcopy_advised();
if (!migrate_use_compression()) {
invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
}
@ -4560,7 +4613,8 @@ static SaveVMHandlers savevm_ram_handlers = {
.save_live_complete_postcopy = ram_save_complete,
.save_live_complete_precopy = ram_save_complete,
.has_postcopy = ram_has_postcopy,
.save_live_pending = ram_save_pending,
.state_pending_exact = ram_state_pending_exact,
.state_pending_estimate = ram_state_pending_estimate,
.load_state = ram_load,
.save_cleanup = ram_save_cleanup,
.load_setup = ram_load_setup,

View File

@ -2785,7 +2785,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
rdma = qatomic_rcu_read(&rioc->rdmaout);
if (!rdma) {
return -EIO;
error_setg(errp, "RDMA control channel output is not set");
return -1;
}
CHECK_ERROR_STATE();
@ -2797,7 +2798,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
ret = qemu_rdma_write_flush(f, rdma);
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
return -1;
}
for (i = 0; i < niov; i++) {
@ -2816,7 +2818,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
return -1;
}
data += len;
@ -2854,6 +2857,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
@ -2867,7 +2871,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
rdma = qatomic_rcu_read(&rioc->rdmain);
if (!rdma) {
return -EIO;
error_setg(errp, "RDMA control channel input is not set");
return -1;
}
CHECK_ERROR_STATE();
@ -2903,7 +2908,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
return -1;
}
/*

View File

@ -42,7 +42,6 @@
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
@ -67,6 +66,7 @@
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"
const unsigned int postcopy_ram_discard_version;
@ -586,6 +586,7 @@ static void dump_vmstate_vmsd(FILE *out_file,
field++;
first = false;
}
assert(field->flags == VMS_END);
fprintf(out_file, "\n%*s]", indent, "");
}
if (vmsd->subsections != NULL) {
@@ -804,6 +805,42 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
}
}
/*
* Perform some basic checks on vmsd's at registration
* time.
*/
static void vmstate_check(const VMStateDescription *vmsd)
{
const VMStateField *field = vmsd->fields;
const VMStateDescription **subsection = vmsd->subsections;
if (field) {
while (field->name) {
if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
/* Recurse to sub structures */
vmstate_check(field->vmsd);
}
/* Carry on */
field++;
}
/* Check for the end of field list canary */
if (field->flags != VMS_END) {
error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
g_assert_not_reached();
}
}
while (subsection && *subsection) {
/*
* The name of a subsection should start with the name of the
* current object.
*/
assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
vmstate_check(*subsection);
subsection++;
}
}
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
void *opaque, int alias_id,
@@ -849,6 +886,11 @@ int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
} else {
se->instance_id = instance_id;
}
/* Perform a recursive sanity check during the test runs */
if (qtest_enabled()) {
vmstate_check(vmsd);
}
assert(!se->compat || se->instance_id == 0);
savevm_state_handler_insert(se);
return 0;
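The new vmstate_check() walks every registered VMStateDescription (recursing into VMS_STRUCT fields) and insists that the field list is terminated by a VMS_END entry. That terminator is what VMSTATE_END_OF_LIST() expands to, so a well-formed description looks like the hedged sketch below (MyDevState and its field are invented for illustration):

    typedef struct MyDevState { uint32_t counter; } MyDevState;   /* hypothetical */

    static const VMStateDescription vmstate_mydev = {
        .name = "mydev",
        .version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(counter, MyDevState),
            VMSTATE_END_OF_LIST()   /* omitting this is what the check catches */
        },
    };

Because the check is guarded by qtest_enabled(), a broken description aborts during the test runs rather than in production.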
@@ -898,17 +940,6 @@ static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
}
}
- static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
- JSONWriter *vmdesc)
- {
- trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
- if (!se->vmsd) {
- vmstate_save_old_style(f, se, vmdesc);
- return 0;
- }
- return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
- }
/*
* Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
*/
@@ -942,6 +973,43 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
}
}
static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
{
int ret;
if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
return 0;
}
if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
trace_savevm_section_skip(se->idstr, se->section_id);
return 0;
}
trace_savevm_section_start(se->idstr, se->section_id);
save_section_header(f, se, QEMU_VM_SECTION_FULL);
if (vmdesc) {
json_writer_start_object(vmdesc, NULL);
json_writer_str(vmdesc, "name", se->idstr);
json_writer_int64(vmdesc, "instance_id", se->instance_id);
}
trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
if (!se->vmsd) {
vmstate_save_old_style(f, se, vmdesc);
} else {
ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
if (ret) {
return ret;
}
}
trace_savevm_section_end(se->idstr, se->section_id, 0);
save_section_footer(f, se);
if (vmdesc) {
json_writer_end_object(vmdesc);
}
return 0;
}
/**
* qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
* command and associated data.
@@ -1164,12 +1232,27 @@ bool qemu_savevm_state_guest_unplug_pending(void)
void qemu_savevm_state_setup(QEMUFile *f)
{
MigrationState *ms = migrate_get_current();
SaveStateEntry *se;
Error *local_err = NULL;
int ret;
ms->vmdesc = json_writer_new(false);
json_writer_start_object(ms->vmdesc, NULL);
json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
json_writer_start_array(ms->vmdesc, "devices");
trace_savevm_state_setup();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->vmsd && se->vmsd->early_setup) {
ret = vmstate_save(f, se, ms->vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
break;
}
continue;
}
if (!se->ops || !se->ops->save_setup) {
continue;
}
@@ -1365,41 +1448,23 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
{
- g_autoptr(JSONWriter) vmdesc = NULL;
+ MigrationState *ms = migrate_get_current();
+ JSONWriter *vmdesc = ms->vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;
- vmdesc = json_writer_new(false);
- json_writer_start_object(vmdesc, NULL);
- json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
- json_writer_start_array(vmdesc, "devices");
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- trace_savevm_section_skip(se->idstr, se->section_id);
+ if (se->vmsd && se->vmsd->early_setup) {
+ /* Already saved during qemu_savevm_state_setup(). */
continue;
}
- trace_savevm_section_start(se->idstr, se->section_id);
- json_writer_start_object(vmdesc, NULL);
- json_writer_str(vmdesc, "name", se->idstr);
- json_writer_int64(vmdesc, "instance_id", se->instance_id);
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
ret = vmstate_save(f, se, vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
return ret;
}
- trace_savevm_section_end(se->idstr, se->section_id, 0);
- save_section_footer(f, se);
- json_writer_end_object(vmdesc);
}
if (inactivate_disks) {
@@ -1428,6 +1493,10 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
}
/* Free it now to detect any inconsistencies. */
json_writer_free(vmdesc);
ms->vmdesc = NULL;
return 0;
}
@@ -1472,10 +1541,9 @@ flush:
* the result is split into the amount for units that can and
* for units that can't do postcopy.
*/
- void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+ void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
SaveStateEntry *se;
@@ -1483,9 +1551,8 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
*res_compatible = 0;
*res_postcopy_only = 0;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_pending) {
+ if (!se->ops || !se->ops->state_pending_exact) {
continue;
}
if (se->ops->is_active) {
@@ -1493,9 +1560,34 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
continue;
}
}
- se->ops->save_live_pending(f, se->opaque, threshold_size,
- res_precopy_only, res_compatible,
- res_postcopy_only);
+ se->ops->state_pending_exact(se->opaque,
+ res_precopy_only, res_compatible,
+ res_postcopy_only);
}
}
void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
SaveStateEntry *se;
*res_precopy_only = 0;
*res_compatible = 0;
*res_postcopy_only = 0;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->state_pending_estimate) {
continue;
}
if (se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
}
}
se->ops->state_pending_estimate(se->opaque,
res_precopy_only, res_compatible,
res_postcopy_only);
}
}
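On the calling side, the intent of the split is that the cheap estimate is polled every iteration and the expensive exact sync is only paid for when migration looks close to converging. A sketch of that pattern (threshold_size here stands for the remaining-data budget; the exact control flow in migration.c may differ):

    uint64_t pre, compat, post, pending;

    qemu_savevm_state_pending_estimate(&pre, &compat, &post);
    pending = pre + compat + post;

    if (pending <= threshold_size) {
        /* The cheap estimate says we might be done: pay for the exact answer. */
        qemu_savevm_state_pending_exact(&pre, &compat, &post);
        pending = pre + compat + post;
    }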
@@ -1595,21 +1687,10 @@ int qemu_save_device_state(QEMUFile *f)
if (se->is_ram) {
continue;
}
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- continue;
- }
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
ret = vmstate_save(f, se, NULL);
if (ret) {
return ret;
}
- save_section_footer(f, se);
}
qemu_put_byte(f, QEMU_VM_EOF);


@@ -40,10 +40,12 @@ void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks);
- void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only);
+ void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
+ void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
void qemu_savevm_send_open_return_path(QEMUFile *f);
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);

migration/threadinfo.c (new file)

@@ -0,0 +1,51 @@
/*
* Migration Threads info
*
* Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
*
* Authors:
* Jiang Jiacheng <jiangjiacheng@huawei.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "threadinfo.h"
static QLIST_HEAD(, MigrationThread) migration_threads;
MigrationThread *MigrationThreadAdd(const char *name, int thread_id)
{
MigrationThread *thread = g_new0(MigrationThread, 1);
thread->name = name;
thread->thread_id = thread_id;
QLIST_INSERT_HEAD(&migration_threads, thread, node);
return thread;
}
void MigrationThreadDel(MigrationThread *thread)
{
if (thread) {
QLIST_REMOVE(thread, node);
g_free(thread);
}
}
MigrationThreadInfoList *qmp_query_migrationthreads(Error **errp)
{
MigrationThreadInfoList *head = NULL;
MigrationThreadInfoList **tail = &head;
MigrationThread *thread = NULL;
QLIST_FOREACH(thread, &migration_threads, node) {
MigrationThreadInfo *info = g_new0(MigrationThreadInfo, 1);
info->name = g_strdup(thread->name);
info->thread_id = thread->thread_id;
QAPI_LIST_APPEND(tail, info);
}
return head;
}
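The new helpers are meant to be called from the migration worker threads themselves so that query-migrationthreads can report them. A hedged usage sketch (thread name and surrounding function are illustrative):

    static void *my_migration_worker(void *opaque)
    {
        MigrationThread *self;

        self = MigrationThreadAdd("live_migration", qemu_get_thread_id());
        /* ... do the actual migration work ... */
        MigrationThreadDel(self);
        return NULL;
    }

Note that MigrationThreadAdd() stores the name pointer as-is rather than copying it, so callers should pass a string literal or another string that outlives the thread entry.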

migration/threadinfo.h (new file)

@@ -0,0 +1,28 @@
/*
* Migration Threads info
*
* Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
*
* Authors:
* Jiang Jiacheng <jiangjiacheng@huawei.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/queue.h"
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
typedef struct MigrationThread MigrationThread;
struct MigrationThread {
const char *name; /* the name of migration thread */
int thread_id; /* ID of the underlying host thread */
QLIST_ENTRY(MigrationThread) node;
};
MigrationThread *MigrationThreadAdd(const char *name, int thread_id);
void MigrationThreadDel(MigrationThread *info);


@@ -150,7 +150,8 @@ migrate_fd_cleanup(void) ""
migrate_fd_error(const char *error_desc) "error=%s"
migrate_fd_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
- migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+ migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+ migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64
migration_completion_file_err(void) ""
@@ -330,7 +331,7 @@ send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uin
dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
dirty_bitmap_save_complete_enter(void) ""
dirty_bitmap_save_complete_finish(void) ""
- dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
+ dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64
dirty_bitmap_load_complete(void) ""
dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
dirty_bitmap_load_bits_zeroes(void) ""
@@ -355,7 +356,7 @@ migration_block_save_device_dirty(int64_t sector) "Error reading sector %" PRId6
migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d"
migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d"
migration_block_save_complete(void) "Block migration completed"
- migration_block_save_pending(uint64_t pending) "Enter save live pending %" PRIu64
+ migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64
# page_cache.c
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64


@@ -154,6 +154,7 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
}
field++;
}
assert(field->flags == VMS_END);
ret = vmstate_subsection_load(f, vmsd, opaque);
if (ret != 0) {
return ret;
@@ -408,6 +409,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
}
field++;
}
assert(field->flags == VMS_END);
if (vmdesc) {
json_writer_end_array(vmdesc);


@@ -1958,6 +1958,35 @@
{ 'command': 'query-vcpu-dirty-limit',
'returns': [ 'DirtyLimitInfo' ] }
##
# @MigrationThreadInfo:
#
# Information about migrationthreads
#
# @name: the name of migration thread
#
# @thread-id: ID of the underlying host thread
#
# Since: 7.2
##
{ 'struct': 'MigrationThreadInfo',
'data': {'name': 'str',
'thread-id': 'int'} }
##
# @query-migrationthreads:
#
# Returns information of migration threads
#
# data: migration thread name
#
# returns: information about migration threads
#
# Since: 7.2
##
{ 'command': 'query-migrationthreads',
'returns': ['MigrationThreadInfo'] }
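A hedged example of driving the new command over QMP (the thread name and id below are purely illustrative):

    -> { "execute": "query-migrationthreads" }
    <- { "return": [ { "name": "live_migration", "thread-id": 12345 } ] }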
##
# @snapshot-save:
#


@@ -614,7 +614,7 @@ static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz,
iov.iov_base = buf;
iov.iov_len = sz;
n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1,
&fds, &nfds, errp);
&fds, &nfds, 0, errp);
if (n_read == QIO_CHANNEL_ERR_BLOCK) {
qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN);


@@ -61,14 +61,14 @@ static bool uffd_feature_thread_id;
#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <sys/ioctl.h>
- #include <linux/userfaultfd.h>
+ #include "qemu/userfaultfd.h"
static bool ufd_version_check(void)
{
struct uffdio_api api_struct;
uint64_t ioctl_mask;
- int ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ int ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
g_test_message("Skipping test: userfaultfd not available");


@@ -115,7 +115,7 @@ void *tpm_emu_ctrl_thread(void *data)
int *pfd = NULL;
size_t nfd = 0;
qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort);
qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort);
cmd = be32_to_cpu(cmd);
g_assert_cmpint(cmd, ==, CMD_SET_DATAFD);
g_assert_cmpint(nfd, ==, 1);


@@ -460,6 +460,7 @@ static void test_io_channel_unix_fd_pass(void)
G_N_ELEMENTS(iorecv),
&fdrecv,
&nfdrecv,
+ 0,
&error_abort);
g_assert(nfdrecv == G_N_ELEMENTS(fdsend));


@@ -19,6 +19,15 @@
#include <sys/syscall.h>
#include <sys/ioctl.h>
int uffd_open(int flags)
{
#if defined(__NR_userfaultfd)
return syscall(__NR_userfaultfd, flags);
#else
return -EINVAL;
#endif
}
/**
* uffd_query_features: query UFFD features
*
@@ -32,7 +41,7 @@ int uffd_query_features(uint64_t *features)
struct uffdio_api api_struct = { 0 };
int ret = -1;
- uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ uffd_fd = uffd_open(O_CLOEXEC);
if (uffd_fd < 0) {
trace_uffd_query_features_nosys(errno);
return -1;
@@ -69,7 +78,7 @@ int uffd_create_fd(uint64_t features, bool non_blocking)
uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
- uffd_fd = syscall(__NR_userfaultfd, flags);
+ uffd_fd = uffd_open(flags);
if (uffd_fd < 0) {
trace_uffd_create_fd_nosys(errno);
return -1;


@@ -116,7 +116,7 @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
* qio_channel_readv_full may have short reads, keeping calling it
* until getting VHOST_USER_HDR_SIZE or 0 bytes in total
*/
rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err);
if (rc < 0) {
if (rc == QIO_CHANNEL_ERR_BLOCK) {
assert(local_err == NULL);