diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 29ffe5075e..c2265436ac 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak b/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak new file mode 100644 index 0000000000..ee2bb8c5c9 --- /dev/null +++ b/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak @@ -0,0 +1,7 @@ +# Boards: +# +CONFIG_ISAPC=n +CONFIG_I440FX=n +CONFIG_Q35=n +CONFIG_MICROVM=y + diff --git a/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak b/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak new file mode 100644 index 0000000000..f7e4dae842 --- /dev/null +++ b/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak @@ -0,0 +1,6 @@ +# Boards: +# +CONFIG_ISAPC=y +CONFIG_I440FX=y +CONFIG_Q35=y +CONFIG_MICROVM=y diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst index 3e9656d8e0..6f65c23b47 100644 --- a/docs/devel/migration.rst +++ b/docs/devel/migration.rst @@ -482,15 +482,17 @@ An iterative device must provide: - A ``load_setup`` function that initialises the data structures on the destination. - - A ``save_live_pending`` function that is called repeatedly and must - indicate how much more data the iterative data must save. The core - migration code will use this to determine when to pause the CPUs - and complete the migration. + - A ``state_pending_exact`` function that indicates how much more + data we must save. The core migration code will use this to + determine when to pause the CPUs and complete the migration. - - A ``save_live_iterate`` function (called after ``save_live_pending`` - when there is significant data still to be sent). It should send - a chunk of data until the point that stream bandwidth limits tell it - to stop. Each call generates one section. + - A ``state_pending_estimate`` function that indicates how much more + data we must save. When the estimated amount is smaller than the + threshold, we call ``state_pending_exact``. + + - A ``save_live_iterate`` function should send a chunk of data until + the point that stream bandwidth limits tell it to stop. Each call + generates one section. - A ``save_live_complete_precopy`` function that must transmit the last section for the device containing any remaining data. diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst index 9ff6163c88..673057c90d 100644 --- a/docs/devel/vfio-migration.rst +++ b/docs/devel/vfio-migration.rst @@ -28,7 +28,7 @@ VFIO implements the device hooks for the iterative approach as follows: * A ``load_setup`` function that sets up the migration region on the destination and sets _RESUMING flag in the VFIO device state. -* A ``save_live_pending`` function that reads pending_bytes from the vendor +* A ``state_pending_exact`` function that reads pending_bytes from the vendor driver, which indicates the amount of data that the vendor driver has yet to save for the VFIO device. 
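For devices converted by this series, both new hooks typically point at a single implementation when the exact remainder is cheap to compute (as the s390 stattrib and VFIO conversions below do). A minimal sketch, assuming only the state_pending_* signatures added to include/migration/register.h in this series; the foo_* names and helpers are hypothetical:

static void foo_state_pending(void *opaque,
                              uint64_t *res_precopy_only,
                              uint64_t *res_compatible,
                              uint64_t *res_postcopy_only)
{
    FooState *s = opaque;                        /* hypothetical device state */

    /* All of foo's data must be sent before the destination starts running. */
    *res_precopy_only += foo_remaining_bytes(s); /* hypothetical helper */
}

static SaveVMHandlers savevm_foo_handlers = {
    .save_setup = foo_save_setup,
    .save_live_iterate = foo_save_iterate,
    .save_live_complete_precopy = foo_save_complete,
    /* The exact count is cheap here, so reuse the same callback for both. */
    .state_pending_estimate = foo_state_pending,
    .state_pending_exact = foo_state_pending,
};

The core migration loop calls state_pending_estimate() on every iteration and only asks for state_pending_exact() once the estimate drops below the threshold (see the migration_iteration_run() changes further below).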
@@ -114,7 +114,7 @@ Live migration save path (RUNNING, _SETUP, _RUNNING|_SAVING) | (RUNNING, _ACTIVE, _RUNNING|_SAVING) - If device is active, get pending_bytes by .save_live_pending() + If device is active, get pending_bytes by .state_pending_exact() If total pending_bytes >= threshold_size, call .save_live_iterate() Data of VFIO device for pre-copy phase is copied Iterate till total pending bytes converge and are less than threshold diff --git a/hw/core/machine.c b/hw/core/machine.c index f7761baab5..b5cd42cd8c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -41,7 +41,9 @@ #include "hw/virtio/virtio-pci.h" #include "qom/object_interfaces.h" -GlobalProperty hw_compat_7_2[] = {}; +GlobalProperty hw_compat_7_2[] = { + { "virtio-mem", "x-early-migration", "false" }, +}; const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); GlobalProperty hw_compat_7_1[] = { diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c index 9eda1c3b2a..3e32002eab 100644 --- a/hw/s390x/s390-stattrib.c +++ b/hw/s390x/s390-stattrib.c @@ -182,10 +182,10 @@ static int cmma_save_setup(QEMUFile *f, void *opaque) return 0; } -static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +static void cmma_state_pending(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { S390StAttribState *sas = S390_STATTRIB(opaque); S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas); @@ -371,7 +371,8 @@ static SaveVMHandlers savevm_s390_stattrib_handlers = { .save_setup = cmma_save_setup, .save_live_iterate = cmma_save_iterate, .save_live_complete_precopy = cmma_save_complete, - .save_live_pending = cmma_save_pending, + .state_pending_exact = cmma_state_pending, + .state_pending_estimate = cmma_state_pending, .save_cleanup = cmma_save_cleanup, .load_state = cmma_load, .is_active = cmma_active, diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index c74453e0b5..b3318f0f20 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -456,11 +456,10 @@ static void vfio_save_cleanup(void *opaque) trace_vfio_save_cleanup(vbasedev->name); } -static void vfio_save_pending(QEMUFile *f, void *opaque, - uint64_t threshold_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +static void vfio_state_pending(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; @@ -473,7 +472,7 @@ static void vfio_save_pending(QEMUFile *f, void *opaque, *res_precopy_only += migration->pending_bytes; - trace_vfio_save_pending(vbasedev->name, *res_precopy_only, + trace_vfio_state_pending(vbasedev->name, *res_precopy_only, *res_postcopy_only, *res_compatible); } @@ -515,9 +514,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) } /* - * Reset pending_bytes as .save_live_pending is not called during savevm or - * snapshot case, in such case vfio_update_pending() at the start of this - * function updates pending_bytes. + * Reset pending_bytes as state_pending* are not called during + * savevm or snapshot case, in such case vfio_update_pending() at + * the start of this function updates pending_bytes. 
*/ migration->pending_bytes = 0; trace_vfio_save_iterate(vbasedev->name, data_size); @@ -685,7 +684,8 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) static SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, - .save_live_pending = vfio_save_pending, + .state_pending_exact = vfio_state_pending, + .state_pending_estimate = vfio_state_pending, .save_live_iterate = vfio_save_iterate, .save_live_complete_precopy = vfio_save_complete_precopy, .save_state = vfio_save_state, diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 73dffe9e00..52de1c84f8 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -157,7 +157,7 @@ vfio_save_cleanup(const char *name) " (%s)" vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64 vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64 vfio_save_device_config_state(const char *name) " (%s)" -vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 +vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" vfio_save_complete_precopy(const char *name) " (%s)" vfio_load_device_config_state(const char *name) " (%s)" diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 1ed1f5a4af..957fe77dc0 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -31,6 +31,8 @@ #include CONFIG_DEVICES #include "trace.h" +static const VMStateDescription vmstate_virtio_mem_device_early; + /* * We only had legacy x86 guests that did not support * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests. @@ -202,6 +204,30 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, return ret; } +static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg, + virtio_mem_range_cb cb) +{ + unsigned long first_bit, last_bit; + uint64_t offset, size; + int ret = 0; + + first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size); + while (first_bit < vmem->bitmap_size) { + offset = first_bit * vmem->block_size; + last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + first_bit + 1) - 1; + size = (last_bit - first_bit + 1) * vmem->block_size; + + ret = cb(vmem, arg, offset, size); + if (ret) { + break; + } + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + last_bit + 2); + } + return ret; +} + /* * Adjust the memory section to cover the intersection with the given range. * @@ -772,6 +798,12 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) error_setg(errp, "'%s' property specifies an unsupported memdev", VIRTIO_MEM_MEMDEV_PROP); return; + } else if (vmem->memdev->prealloc) { + error_setg(errp, "'%s' property specifies a memdev with preallocation" + " enabled: %s. Instead, specify 'prealloc=on' for the" + " virtio-mem device. 
", VIRTIO_MEM_MEMDEV_PROP, + object_get_canonical_path_component(OBJECT(vmem->memdev))); + return; } if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) || @@ -872,6 +904,10 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) host_memory_backend_set_mapped(vmem->memdev, true); vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); + if (vmem->early_migration) { + vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY, + &vmstate_virtio_mem_device_early, vmem); + } qemu_register_reset(virtio_mem_system_reset, vmem); /* @@ -893,6 +929,10 @@ static void virtio_mem_device_unrealize(DeviceState *dev) */ memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); qemu_unregister_reset(virtio_mem_system_reset, vmem); + if (vmem->early_migration) { + vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early, + vmem); + } vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem)); host_memory_backend_set_mapped(vmem->memdev, false); virtio_del_queue(vdev, 0); @@ -922,6 +962,10 @@ static int virtio_mem_post_load(void *opaque, int version_id) RamDiscardListener *rdl; int ret; + if (vmem->prealloc && !vmem->early_migration) { + warn_report("Proper preallocation with migration requires a newer QEMU machine"); + } + /* * We started out with all memory discarded and our memory region is mapped * into an address space. Replay, now that we updated the bitmap. @@ -941,6 +985,64 @@ static int virtio_mem_post_load(void *opaque, int version_id) return virtio_mem_restore_unplugged(vmem); } +static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) +{ + void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset; + int fd = memory_region_get_fd(&vmem->memdev->mr); + Error *local_err = NULL; + + qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err); + if (local_err) { + error_report_err(local_err); + return -ENOMEM; + } + return 0; +} + +static int virtio_mem_post_load_early(void *opaque, int version_id) +{ + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + RAMBlock *rb = vmem->memdev->mr.ram_block; + int ret; + + if (!vmem->prealloc) { + return 0; + } + + /* + * We restored the bitmap and verified that the basic properties + * match on source and destination, so we can go ahead and preallocate + * memory for all plugged memory blocks, before actual RAM migration starts + * touching this memory. + */ + ret = virtio_mem_for_each_plugged_range(vmem, NULL, + virtio_mem_prealloc_range_cb); + if (ret) { + return ret; + } + + /* + * This is tricky: postcopy wants to start with a clean slate. On + * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily + * preallocated) RAM such that postcopy will work as expected later. + * + * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual + * RAM migration. So let's discard all memory again. This looks like an + * expensive NOP, but actually serves a purpose: we made sure that we + * were able to allocate all required backend memory once. We cannot + * guarantee that the backend memory we will free will remain free + * until we need it during postcopy, but at least we can catch the + * obvious setup issues this way. 
+ */ + if (migration_incoming_postcopy_advised()) { + if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) { + return -EBUSY; + } + } + return 0; +} + typedef struct VirtIOMEMMigSanityChecks { VirtIOMEM *parent; uint64_t addr; @@ -1009,18 +1111,54 @@ static const VMStateDescription vmstate_virtio_mem_sanity_checks = { }, }; +static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(opaque); + + /* With early migration, these fields were already migrated. */ + return !vmem->early_migration; +} + static const VMStateDescription vmstate_virtio_mem_device = { .name = "virtio-mem-device", .minimum_version_id = 1, .version_id = 1, .priority = MIG_PRI_VIRTIO_MEM, .post_load = virtio_mem_post_load, + .fields = (VMStateField[]) { + VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists, + VirtIOMEMMigSanityChecks, + vmstate_virtio_mem_sanity_checks), + VMSTATE_UINT64(usable_region_size, VirtIOMEM), + VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists), + VMSTATE_UINT64(requested_size, VirtIOMEM), + VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists, + 0, bitmap_size), + VMSTATE_END_OF_LIST() + }, +}; + +/* + * Transfer properties that are immutable while migration is active early, + * such that we have have this information around before migrating any RAM + * content. + * + * Note that virtio_mem_is_busy() makes sure these properties can no longer + * change on the migration source until migration completed. + * + * With QEMU compat machines, we transmit these properties later, via + * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists(). + */ +static const VMStateDescription vmstate_virtio_mem_device_early = { + .name = "virtio-mem-device-early", + .minimum_version_id = 1, + .version_id = 1, + .early_setup = true, + .post_load = virtio_mem_post_load_early, .fields = (VMStateField[]) { VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks, vmstate_virtio_mem_sanity_checks), - VMSTATE_UINT64(usable_region_size, VirtIOMEM), VMSTATE_UINT64(size, VirtIOMEM), - VMSTATE_UINT64(requested_size, VirtIOMEM), VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size), VMSTATE_END_OF_LIST() }, @@ -1205,6 +1343,8 @@ static Property virtio_mem_properties[] = { DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM, unplugged_inaccessible, ON_OFF_AUTO_AUTO), #endif + DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM, + early_migration, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h index 7745cfc1a3..f15e561785 100644 --- a/include/hw/virtio/virtio-mem.h +++ b/include/hw/virtio/virtio-mem.h @@ -31,6 +31,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass, #define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size" #define VIRTIO_MEM_ADDR_PROP "memaddr" #define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible" +#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration" #define VIRTIO_MEM_PREALLOC_PROP "prealloc" struct VirtIOMEM { @@ -74,6 +75,13 @@ struct VirtIOMEM { /* whether to prealloc memory when plugging new blocks */ bool prealloc; + /* + * Whether we migrate properties that are immutable while migration is + * active early, before state of other devices and especially, before + * migrating any RAM content. 
+ */ + bool early_migration; + /* notifiers to notify when "size" changes */ NotifierList size_change_notifiers; diff --git a/include/io/channel.h b/include/io/channel.h index 78b15f7870..153fbd2904 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, + QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); diff --git a/include/migration/misc.h b/include/migration/misc.h index 465906710d..8b49841016 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -67,8 +67,10 @@ bool migration_has_failed(MigrationState *); /* ...and after the device transmission */ bool migration_in_postcopy_after_devices(MigrationState *); void migration_global_dump(Monitor *mon); -/* True if incomming migration entered POSTCOPY_INCOMING_DISCARD */ +/* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); +/* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */ +bool migration_incoming_postcopy_advised(void); /* True if background snapshot is active */ bool migration_in_bg_snapshot(void); diff --git a/include/migration/register.h b/include/migration/register.h index c1dcff0f90..b91a0cdbf8 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -46,11 +46,6 @@ typedef struct SaveVMHandlers { /* This runs outside the iothread lock! */ int (*save_setup)(QEMUFile *f, void *opaque); - void (*save_live_pending)(QEMUFile *f, void *opaque, - uint64_t threshold_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only); /* Note for save_live_pending: * - res_precopy_only is for data which must be migrated in precopy phase * or in stopped state, in other words - before target vm start @@ -61,8 +56,16 @@ typedef struct SaveVMHandlers { * Sum of res_postcopy_only, res_compatible and res_postcopy_only is the * whole amount of pending data. 
*/ - - + /* This estimates the remaining data to transfer */ + void (*state_pending_estimate)(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only); + /* This calculate the exact remaining data to transfer */ + void (*state_pending_exact)(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only); LoadStateHandler *load_state; int (*load_setup)(QEMUFile *f, void *opaque); int (*load_cleanup)(void *opaque); diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index ad24aa1934..084f5e784a 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -147,6 +147,9 @@ enum VMStateFlags { * VMStateField.struct_version_id to tell which version of the * structure we are referencing to use. */ VMS_VSTRUCT = 0x8000, + + /* Marker for end of list */ + VMS_END = 0x10000 }; typedef enum { @@ -178,7 +181,21 @@ struct VMStateField { struct VMStateDescription { const char *name; - int unmigratable; + bool unmigratable; + /* + * This VMSD describes something that should be sent during setup phase + * of migration. It plays similar role as save_setup() for explicitly + * registered vmstate entries, so it can be seen as a way to describe + * save_setup() in VMSD structures. + * + * Note that for now, a SaveStateEntry cannot have a VMSD and + * operations (e.g., save_setup()) set at the same time. Consequently, + * save_setup() and a VMSD with early_setup set to true are mutually + * exclusive. For this reason, also early_setup VMSDs are migrated in a + * QEMU_VM_SECTION_FULL section, while save_setup() data is migrated in + * a QEMU_VM_SECTION_START section. + */ + bool early_setup; int version_id; int minimum_version_id; MigrationPriority priority; @@ -705,8 +722,9 @@ extern const VMStateInfo vmstate_info_qlist; * '_state' type * That the pointer is right at the start of _tmp_type. */ -#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \ +#define VMSTATE_WITH_TMP_TEST(_state, _test, _tmp_type, _vmsd) { \ .name = "tmp", \ + .field_exists = (_test), \ .size = sizeof(_tmp_type) + \ QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \ type_check_pointer(_state, \ @@ -715,6 +733,9 @@ extern const VMStateInfo vmstate_info_qlist; .info = &vmstate_info_tmp, \ } +#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) \ + VMSTATE_WITH_TMP_TEST(_state, NULL, _tmp_type, _vmsd) + #define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \ .name = "unused", \ .field_exists = (_test), \ @@ -738,8 +759,9 @@ extern const VMStateInfo vmstate_info_qlist; /* _field_size should be a int32_t field in the _state struct giving the * size of the bitmap _field in bits. */ -#define VMSTATE_BITMAP(_field, _state, _version, _field_size) { \ +#define VMSTATE_BITMAP_TEST(_field, _state, _test, _version, _field_size) { \ .name = (stringify(_field)), \ + .field_exists = (_test), \ .version_id = (_version), \ .size_offset = vmstate_offset_value(_state, _field_size, int32_t),\ .info = &vmstate_info_bitmap, \ @@ -747,6 +769,9 @@ extern const VMStateInfo vmstate_info_qlist; .offset = offsetof(_state, _field), \ } +#define VMSTATE_BITMAP(_field, _state, _version, _field_size) \ + VMSTATE_BITMAP_TEST(_field, _state, NULL, _version, _field_size) + /* For migrating a QTAILQ. * Target QTAILQ needs be properly initialized. 
* _type: type of QTAILQ element @@ -1161,7 +1186,9 @@ extern const VMStateInfo vmstate_info_qlist; VMSTATE_UNUSED_BUFFER(_test, 0, _size) #define VMSTATE_END_OF_LIST() \ - {} + { \ + .flags = VMS_END, \ + } int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, int version_id); diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h index 6b74f92792..d764496f0b 100644 --- a/include/qemu/userfaultfd.h +++ b/include/qemu/userfaultfd.h @@ -13,10 +13,20 @@ #ifndef USERFAULTFD_H #define USERFAULTFD_H +#ifdef CONFIG_LINUX + #include "qemu/osdep.h" #include "exec/hwaddr.h" #include +/** + * uffd_open(): Open an userfaultfd handle for current context. + * + * @flags: The flags we want to pass in when creating the handle. + * + * Returns: the uffd handle if >=0, or <0 if error happens. + */ +int uffd_open(int flags); int uffd_query_features(uint64_t *features); int uffd_create_fd(uint64_t features, bool non_blocking); void uffd_close_fd(int uffd_fd); @@ -32,4 +42,6 @@ int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); bool uffd_poll_events(int uffd_fd, int tmo); +#endif /* CONFIG_LINUX */ + #endif /* USERFAULTFD_H */ diff --git a/io/channel-buffer.c b/io/channel-buffer.c index bf52011be2..8096180f85 100644 --- a/io/channel-buffer.c +++ b/io/channel-buffer.c @@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); diff --git a/io/channel-command.c b/io/channel-command.c index 74516252ba..e7edd091af 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); diff --git a/io/channel-file.c b/io/channel-file.c index b67687c2aa..d76663e6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); diff --git a/io/channel-null.c b/io/channel-null.c index 75e3781507..4fafdb770d 100644 --- a/io/channel-null.c +++ b/io/channel-null.c @@ -60,6 +60,7 @@ qio_channel_null_readv(QIOChannel *ioc, size_t niov, int **fds G_GNUC_UNUSED, size_t *nfds G_GNUC_UNUSED, + int flags, Error **errp) { QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc); diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..7aca84f61a 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -173,6 +173,9 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc, } #endif + qio_channel_set_feature(QIO_CHANNEL(ioc), + QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + return 0; } @@ -406,6 +409,9 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ + qio_channel_set_feature(QIO_CHANNEL(cioc), + QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd); return cioc; @@ -496,6 +502,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); @@ -517,6 +524,10 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, } + if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) { + sflags |= MSG_PEEK; + } + retry: ret = recvmsg(sioc->fd, &msg, sflags); if 
(ret < 0) { @@ -624,11 +635,17 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); ssize_t done = 0; ssize_t i; + int sflags = 0; + + if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) { + sflags |= MSG_PEEK; + } for (i = 0; i < niov; i++) { ssize_t ret; @@ -636,7 +653,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, ret = recv(sioc->fd, iov[i].iov_base, iov[i].iov_len, - 0); + sflags); if (ret < 0) { if (errno == EAGAIN) { if (done) { diff --git a/io/channel-tls.c b/io/channel-tls.c index 4ce890a538..c730cb8ec5 100644 --- a/io/channel-tls.c +++ b/io/channel-tls.c @@ -260,6 +260,7 @@ static ssize_t qio_channel_tls_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc); diff --git a/io/channel-websock.c b/io/channel-websock.c index fb4932ade7..a12acc27cf 100644 --- a/io/channel-websock.c +++ b/io/channel-websock.c @@ -1081,6 +1081,7 @@ static ssize_t qio_channel_websock_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc); diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a8c7f11649 100644 --- a/io/channel.c +++ b/io/channel.c @@ -52,6 +52,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); @@ -63,7 +64,14 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, return -1; } - return klass->io_readv(ioc, iov, niov, fds, nfds, errp); + if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) && + !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { + error_setg_errno(errp, EINVAL, + "Channel does not support peek read"); + return -1; + } + + return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp); } @@ -146,7 +154,7 @@ int qio_channel_readv_full_all_eof(QIOChannel *ioc, while ((nlocal_iov > 0) || local_fds) { ssize_t len; len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds, - local_nfds, errp); + local_nfds, 0, errp); if (len == QIO_CHANNEL_ERR_BLOCK) { if (qemu_in_coroutine()) { qio_channel_yield(ioc, G_IO_IN); @@ -284,7 +292,7 @@ ssize_t qio_channel_readv(QIOChannel *ioc, size_t niov, Error **errp) { - return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp); + return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp); } @@ -303,7 +311,7 @@ ssize_t qio_channel_read(QIOChannel *ioc, Error **errp) { struct iovec iov = { .iov_base = buf, .iov_len = buflen }; - return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp); + return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp); } diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index 15127d489a..5a621419d3 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -762,11 +762,10 @@ static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque) return 0; } -static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque, - uint64_t max_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +static void dirty_bitmap_state_pending(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { DBMSaveState *s = &((DBMState *)opaque)->save; SaveBitmapState *dbms; @@ -784,7 +783,7 @@ static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque, 
qemu_mutex_unlock_iothread(); - trace_dirty_bitmap_save_pending(pending, max_size); + trace_dirty_bitmap_state_pending(pending); *res_postcopy_only += pending; } @@ -1253,7 +1252,8 @@ static SaveVMHandlers savevm_dirty_bitmap_handlers = { .save_live_complete_postcopy = dirty_bitmap_save_complete, .save_live_complete_precopy = dirty_bitmap_save_complete, .has_postcopy = dirty_bitmap_has_postcopy, - .save_live_pending = dirty_bitmap_save_pending, + .state_pending_exact = dirty_bitmap_state_pending, + .state_pending_estimate = dirty_bitmap_state_pending, .save_live_iterate = dirty_bitmap_save_iterate, .is_active_iterate = dirty_bitmap_is_active_iterate, .load_state = dirty_bitmap_load, diff --git a/migration/block.c b/migration/block.c index 5da15a62de..29f69025af 100644 --- a/migration/block.c +++ b/migration/block.c @@ -863,10 +863,10 @@ static int block_save_complete(QEMUFile *f, void *opaque) return 0; } -static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +static void block_state_pending(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { /* Estimate pending number of bytes to send */ uint64_t pending; @@ -885,7 +885,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, pending = BLK_MIG_BLOCK_SIZE; } - trace_migration_block_save_pending(pending); + trace_migration_block_state_pending(pending); /* We don't do postcopy */ *res_precopy_only += pending; } @@ -1020,7 +1020,8 @@ static SaveVMHandlers savevm_block_handlers = { .save_setup = block_save_setup, .save_live_iterate = block_save_iterate, .save_live_complete_precopy = block_save_complete, - .save_live_pending = block_save_pending, + .state_pending_exact = block_state_pending, + .state_pending_estimate = block_state_pending, .load_state = block_load, .save_cleanup = block_migration_cleanup, .is_active = block_is_active, diff --git a/migration/channel-block.c b/migration/channel-block.c index f4ab53acdb..b7374363c3 100644 --- a/migration/channel-block.c +++ b/migration/channel-block.c @@ -53,6 +53,7 @@ qio_channel_block_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc); diff --git a/migration/channel.c b/migration/channel.c index 1b0815039f..ca3319a309 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s, migrate_fd_connect(s, error); error_free(error); } + + +/** + * @migration_channel_read_peek - Peek at migration channel, without + * actually removing it from channel buffer. + * + * @ioc: the channel object + * @buf: the memory region to read data into + * @buflen: the number of bytes to read in @buf + * @errp: pointer to a NULL-initialized error object + * + * Returns 0 if successful, returns -1 and sets @errp if fails. + */ +int migration_channel_read_peek(QIOChannel *ioc, + const char *buf, + const size_t buflen, + Error **errp) +{ + ssize_t len = 0; + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + + while (true) { + len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, + QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { + error_setg(errp, + "Failed to peek at channel"); + return -1; + } + + if (len == buflen) { + break; + } + + /* 1ms sleep. 
*/ + if (qemu_in_coroutine()) { + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); + } else { + g_usleep(1000); + } + } + + return 0; +} diff --git a/migration/channel.h b/migration/channel.h index 67a461c28a..5bdb8208a7 100644 --- a/migration/channel.h +++ b/migration/channel.h @@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, Error *error_in); + +int migration_channel_read_peek(QIOChannel *ioc, + const char *buf, + const size_t buflen, + Error **errp); #endif diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c index 4bfb97fc68..575d48c397 100644 --- a/migration/dirtyrate.c +++ b/migration/dirtyrate.c @@ -714,8 +714,8 @@ void qmp_calc_dirty_rate(int64_t calc_time, mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING; } - if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) { - error_setg(errp, "either sample-pages or dirty-ring can be specified."); + if (has_sample_pages && mode != DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) { + error_setg(errp, "sample-pages is used only in page-sampling mode"); return; } @@ -785,8 +785,10 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict) DirtyRateStatus_str(info->status)); monitor_printf(mon, "Start Time: %"PRIi64" (ms)\n", info->start_time); - monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n", - info->sample_pages); + if (info->mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) { + monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n", + info->sample_pages); + } monitor_printf(mon, "Period: %"PRIi64" (sec)\n", info->calc_time); monitor_printf(mon, "Mode: %s\n", diff --git a/migration/meson.build b/migration/meson.build index a9e7e18793..0d1bb9f96e 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -26,6 +26,7 @@ softmmu_ss.add(files( 'savevm.c', 'socket.c', 'tls.c', + 'threadinfo.c', ), gnutls) softmmu_ss.add(when: rdma, if_true: files('rdma.c')) diff --git a/migration/migration.c b/migration/migration.c index 56859d5869..7a14aa98d8 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -31,6 +31,7 @@ #include "migration.h" #include "savevm.h" #include "qemu-file.h" +#include "channel.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -57,6 +58,7 @@ #include "net/announce.h" #include "qemu/queue.h" #include "multifd.h" +#include "threadinfo.h" #include "qemu/yank.h" #include "sysemu/cpus.h" #include "yank_functions.h" @@ -664,10 +666,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); - if (multifd_load_setup(errp) != 0) { - return false; - } - if (!mis->from_src_file) { mis->from_src_file = f; } @@ -734,31 +732,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; - bool start_migration; QEMUFile *f; + bool default_channel = true; + uint32_t channel_magic = 0; + int ret = 0; - if (!mis->from_src_file) { - /* The first connection (multifd may have multiple) */ + if (migrate_use_multifd() && !migrate_postcopy_ram() && + qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { + /* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. 
Please note this is best effort, postcopy + * preempt channel does not send any magic number so avoid it for + * postcopy live migration. Also tls live migration already does + * tls handshake while initializing main channel so with tls this + * issue is not possible. + */ + ret = migration_channel_read_peek(ioc, (void *)&channel_magic, + sizeof(channel_magic), &local_err); + + if (ret != 0) { + error_propagate(errp, local_err); + return; + } + + default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); + } else { + default_channel = !mis->from_src_file; + } + + if (multifd_load_setup(errp) != 0) { + error_setg(errp, "Failed to setup multifd channels"); + return; + } + + if (default_channel) { f = qemu_file_new_input(ioc); if (!migration_incoming_setup(f, errp)) { return; } - - /* - * Common migration only needs one channel, so we can start - * right now. Some features need more than one channel, we wait. - */ - start_migration = !migration_needs_multiple_sockets(); } else { /* Multiple connections */ assert(migration_needs_multiple_sockets()); if (migrate_use_multifd()) { - start_migration = multifd_recv_new_channel(ioc, &local_err); + multifd_recv_new_channel(ioc, &local_err); } else { assert(migrate_postcopy_preempt()); f = qemu_file_new_input(ioc); - start_migration = postcopy_preempt_new_channel(mis, f); + postcopy_preempt_new_channel(mis, f); } if (local_err) { error_propagate(errp, local_err); @@ -766,7 +789,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } } - if (start_migration) { + if (migration_has_all_channels()) { /* If it's a recovery, we're done */ if (postcopy_try_recover()) { return; @@ -1051,20 +1074,30 @@ bool migration_is_running(int state) } } +static bool migrate_show_downtime(MigrationState *s) +{ + return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy(); +} + static void populate_time_info(MigrationInfo *info, MigrationState *s) { info->has_status = true; info->has_setup_time = true; info->setup_time = s->setup_time; + if (s->state == MIGRATION_STATUS_COMPLETED) { info->has_total_time = true; info->total_time = s->total_time; - info->has_downtime = true; - info->downtime = s->downtime; } else { info->has_total_time = true; info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->start_time; + } + + if (migrate_show_downtime(s)) { + info->has_downtime = true; + info->downtime = s->downtime; + } else { info->has_expected_downtime = true; info->expected_downtime = s->expected_downtime; } @@ -1933,6 +1966,8 @@ static void migrate_fd_cleanup(MigrationState *s) g_free(s->hostname); s->hostname = NULL; + json_writer_free(s->vmdesc); + s->vmdesc = NULL; qemu_savevm_state_cleanup(); @@ -2124,6 +2159,13 @@ bool migration_in_incoming_postcopy(void) return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END; } +bool migration_incoming_postcopy_advised(void) +{ + PostcopyState ps = postcopy_state_get(); + + return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; +} + bool migration_in_bg_snapshot(void) { MigrationState *s = migrate_get_current(); @@ -3778,33 +3820,39 @@ typedef enum { */ static MigIterateState migration_iteration_run(MigrationState *s) { - uint64_t pending_size, pend_pre, pend_compat, pend_post; + uint64_t pend_pre, pend_compat, pend_post; bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; - qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre, - &pend_compat, &pend_post); - pending_size = pend_pre + pend_compat + pend_post; + 
qemu_savevm_state_pending_estimate(&pend_pre, &pend_compat, &pend_post); + uint64_t pending_size = pend_pre + pend_compat + pend_post; - trace_migrate_pending(pending_size, s->threshold_size, - pend_pre, pend_compat, pend_post); + trace_migrate_pending_estimate(pending_size, + pend_pre, pend_compat, pend_post); - if (pending_size && pending_size >= s->threshold_size) { - /* Still a significant amount to transfer */ - if (!in_postcopy && pend_pre <= s->threshold_size && - qatomic_read(&s->start_postcopy)) { - if (postcopy_start(s)) { - error_report("%s: postcopy failed to start", __func__); - } - return MIG_ITERATE_SKIP; - } - /* Just another iteration step */ - qemu_savevm_state_iterate(s->to_dst_file, in_postcopy); - } else { + if (pend_pre + pend_compat <= s->threshold_size) { + qemu_savevm_state_pending_exact(&pend_pre, &pend_compat, &pend_post); + pending_size = pend_pre + pend_compat + pend_post; + trace_migrate_pending_exact(pending_size, + pend_pre, pend_compat, pend_post); + } + + if (!pending_size || pending_size < s->threshold_size) { trace_migration_thread_low_pending(pending_size); migration_completion(s); return MIG_ITERATE_BREAK; } + /* Still a significant amount to transfer */ + if (!in_postcopy && pend_pre <= s->threshold_size && + qatomic_read(&s->start_postcopy)) { + if (postcopy_start(s)) { + error_report("%s: postcopy failed to start", __func__); + } + return MIG_ITERATE_SKIP; + } + + /* Just another iteration step */ + qemu_savevm_state_iterate(s->to_dst_file, in_postcopy); return MIG_ITERATE_RESUME; } @@ -3981,10 +4029,13 @@ static void qemu_savevm_wait_unplug(MigrationState *s, int old_state, static void *migration_thread(void *opaque) { MigrationState *s = opaque; + MigrationThread *thread = NULL; int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); MigThrError thr_error; bool urgent = false; + thread = MigrationThreadAdd("live_migration", qemu_get_thread_id()); + rcu_register_thread(); object_ref(OBJECT(s)); @@ -4061,6 +4112,7 @@ static void *migration_thread(void *opaque) migration_iteration_finish(s); object_unref(OBJECT(s)); rcu_unregister_thread(); + MigrationThreadDel(thread); return NULL; } diff --git a/migration/migration.h b/migration/migration.h index ae4ffd3454..66511ce532 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -17,6 +17,7 @@ #include "exec/cpu-common.h" #include "hw/qdev-core.h" #include "qapi/qapi-types-migration.h" +#include "qapi/qmp/json-writer.h" #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" @@ -366,6 +367,9 @@ struct MigrationState { * This save hostname when out-going migration starts */ char *hostname; + + /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. 
*/ + JSONWriter *vmdesc; }; void migrate_set_state(int *state, int old_state, int new_state); diff --git a/migration/multifd.c b/migration/multifd.c index 000ca4d4ec..b7ad7002e0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -24,6 +24,7 @@ #include "qemu-file.h" #include "trace.h" #include "multifd.h" +#include "threadinfo.h" #include "qemu/yank.h" #include "io/channel-socket.h" @@ -442,6 +443,7 @@ static int multifd_send_pages(QEMUFile *f) int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; + bool changed = false; if (!pages->block) { pages->block = block; @@ -454,14 +456,16 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) if (pages->num < pages->allocated) { return 1; } + } else { + changed = true; } if (multifd_send_pages(f) < 0) { return -1; } - if (pages->block != block) { - return multifd_queue_page(f, block, offset); + if (changed) { + return multifd_queue_page(f, block, offset); } return 1; @@ -627,16 +631,16 @@ int multifd_send_sync_main(QEMUFile *f) stat64_add(&ram_atomic_counters.transferred, p->packet_len); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); - - if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) { - return -1; - } } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); + + if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) { + return -1; + } } trace_multifd_send_sync_main(multifd_send_state->packet_num); @@ -646,10 +650,13 @@ int multifd_send_sync_main(QEMUFile *f) static void *multifd_send_thread(void *opaque) { MultiFDSendParams *p = opaque; + MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; bool use_zero_copy_send = migrate_use_zero_copy_send(); + thread = MigrationThreadAdd(p->name, qemu_get_thread_id()); + trace_multifd_send_thread_start(p->id); rcu_register_thread(); @@ -759,6 +766,7 @@ out: qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); + MigrationThreadDel(thread); trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); return NULL; @@ -1164,9 +1172,14 @@ int multifd_load_setup(Error **errp) uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; - if (!migrate_use_multifd()) { + /* + * Return successfully if multiFD recv state is already initialised + * or multiFD is not enabled. + */ + if (multifd_recv_state || !migrate_use_multifd()) { return 0; } + if (!migrate_multi_channels_is_allowed()) { error_setg(errp, "multifd is not supported by current protocol"); return -1; @@ -1227,11 +1240,9 @@ bool multifd_recv_all_channels_created(void) /* * Try to receive all multifd channels to get ready for the migration. - * - Return true and do not set @errp when correctly receiving all channels; - * - Return false and do not set @errp when correctly receiving the current one; - * - Return false and set @errp when failing to receive the current channel. + * Sets @errp when failing to receive the current channel. 
*/ -bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp) +void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; @@ -1244,7 +1255,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp) "failed to receive packet" " via multifd channel %d: ", qatomic_read(&multifd_recv_state->count)); - return false; + return; } trace_multifd_recv_new_channel(id); @@ -1254,7 +1265,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp) id); multifd_recv_terminate_threads(local_err); error_propagate(errp, local_err); - return false; + return; } p->c = ioc; object_ref(OBJECT(ioc)); @@ -1265,6 +1276,4 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp) qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); - return qatomic_read(&multifd_recv_state->count) == - migrate_multifd_channels(); } diff --git a/migration/multifd.c.orig b/migration/multifd.c.orig new file mode 100644 index 0000000000..ad89293b4e --- /dev/null +++ b/migration/multifd.c.orig @@ -0,0 +1,1274 @@ +/* + * Multifd common code + * + * Copyright (c) 2019-2020 Red Hat Inc + * + * Authors: + * Juan Quintela + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/rcu.h" +#include "exec/target_page.h" +#include "sysemu/sysemu.h" +#include "exec/ramblock.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "ram.h" +#include "migration.h" +#include "socket.h" +#include "tls.h" +#include "qemu-file.h" +#include "trace.h" +#include "multifd.h" + +#include "qemu/yank.h" +#include "io/channel-socket.h" +#include "yank_functions.h" + +/* Multiple fd's */ + +#define MULTIFD_MAGIC 0x11223344U +#define MULTIFD_VERSION 1 + +typedef struct { + uint32_t magic; + uint32_t version; + unsigned char uuid[16]; /* QemuUUID */ + uint8_t id; + uint8_t unused1[7]; /* Reserved for future use */ + uint64_t unused2[4]; /* Reserved for future use */ +} __attribute__((packed)) MultiFDInit_t; + +/* Multifd without compression */ + +/** + * nocomp_send_setup: setup send side + * + * For no compression this function does nothing. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) +{ + return 0; +} + +/** + * nocomp_send_cleanup: cleanup send side + * + * For no compression this function does nothing. + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + return; +} + +/** + * nocomp_send_prepare: prepare date to be able to send + * + * For no compression we just have to calculate the size of the + * packet. 
+ * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < p->normal_num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = p->normal_num * p->page_size; + p->flags |= MULTIFD_FLAG_NOCOMP; + return 0; +} + +/** + * nocomp_recv_setup: setup receive side + * + * For no compression this function does nothing. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + return 0; +} + +/** + * nocomp_recv_cleanup: setup receive side + * + * For no compression this function does nothing. + * + * @p: Params for the channel that we are using + */ +static void nocomp_recv_cleanup(MultiFDRecvParams *p) +{ +} + +/** + * nocomp_recv_pages: read the data from the channel into actual pages + * + * For no compression we just need to read things into the correct place. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +{ + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (flags != MULTIFD_FLAG_NOCOMP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } + for (int i = 0; i < p->normal_num; i++) { + p->iov[i].iov_base = p->host + p->normal[i]; + p->iov[i].iov_len = p->page_size; + } + return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); +} + +static MultiFDMethods multifd_nocomp_ops = { + .send_setup = nocomp_send_setup, + .send_cleanup = nocomp_send_cleanup, + .send_prepare = nocomp_send_prepare, + .recv_setup = nocomp_recv_setup, + .recv_cleanup = nocomp_recv_cleanup, + .recv_pages = nocomp_recv_pages +}; + +static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { + [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops, +}; + +void multifd_register_ops(int method, MultiFDMethods *ops) +{ + assert(0 < method && method < MULTIFD_COMPRESSION__MAX); + multifd_ops[method] = ops; +} + +static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) +{ + MultiFDInit_t msg = {}; + int ret; + + msg.magic = cpu_to_be32(MULTIFD_MAGIC); + msg.version = cpu_to_be32(MULTIFD_VERSION); + msg.id = p->id; + memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid)); + + ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp); + if (ret != 0) { + return -1; + } + return 0; +} + +static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) +{ + MultiFDInit_t msg; + int ret; + + ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp); + if (ret != 0) { + return -1; + } + + msg.magic = be32_to_cpu(msg.magic); + msg.version = be32_to_cpu(msg.version); + + if (msg.magic != MULTIFD_MAGIC) { + error_setg(errp, "multifd: received packet magic %x " + "expected %x", msg.magic, MULTIFD_MAGIC); + return -1; + } + + if (msg.version != MULTIFD_VERSION) { + error_setg(errp, "multifd: received packet version %u " + "expected %u", msg.version, MULTIFD_VERSION); + return -1; + } + + if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) { + char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid); + char 
*msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid); + + error_setg(errp, "multifd: received uuid '%s' and expected " + "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id); + g_free(uuid); + g_free(msg_uuid); + return -1; + } + + if (msg.id > migrate_multifd_channels()) { + error_setg(errp, "multifd: received channel version %u " + "expected %u", msg.version, MULTIFD_VERSION); + return -1; + } + + return msg.id; +} + +static MultiFDPages_t *multifd_pages_init(size_t size) +{ + MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); + + pages->allocated = size; + pages->offset = g_new0(ram_addr_t, size); + + return pages; +} + +static void multifd_pages_clear(MultiFDPages_t *pages) +{ + pages->num = 0; + pages->allocated = 0; + pages->packet_num = 0; + pages->block = NULL; + g_free(pages->offset); + pages->offset = NULL; + g_free(pages); +} + +static void multifd_send_fill_packet(MultiFDSendParams *p) +{ + MultiFDPacket_t *packet = p->packet; + int i; + + packet->flags = cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); + packet->normal_pages = cpu_to_be32(p->normal_num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); + packet->packet_num = cpu_to_be64(p->packet_num); + + if (p->pages->block) { + strncpy(packet->ramblock, p->pages->block->idstr, 256); + } + + for (i = 0; i < p->normal_num; i++) { + /* there are architectures where ram_addr_t is 32 bit */ + uint64_t temp = p->normal[i]; + + packet->offset[i] = cpu_to_be64(temp); + } +} + +static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + MultiFDPacket_t *packet = p->packet; + RAMBlock *block; + int i; + + packet->magic = be32_to_cpu(packet->magic); + if (packet->magic != MULTIFD_MAGIC) { + error_setg(errp, "multifd: received packet " + "magic %x and expected magic %x", + packet->magic, MULTIFD_MAGIC); + return -1; + } + + packet->version = be32_to_cpu(packet->version); + if (packet->version != MULTIFD_VERSION) { + error_setg(errp, "multifd: received packet " + "version %u and expected version %u", + packet->version, MULTIFD_VERSION); + return -1; + } + + p->flags = be32_to_cpu(packet->flags); + + packet->pages_alloc = be32_to_cpu(packet->pages_alloc); + /* + * If we received a packet that is 100 times bigger than expected + * just stop migration. It is a magic number. 
+ */ + if (packet->pages_alloc > p->page_count) { + error_setg(errp, "multifd: received packet " + "with size %u and expected a size of %u", + packet->pages_alloc, p->page_count) ; + return -1; + } + + p->normal_num = be32_to_cpu(packet->normal_pages); + if (p->normal_num > packet->pages_alloc) { + error_setg(errp, "multifd: received packet " + "with %u pages and expected maximum pages are %u", + p->normal_num, packet->pages_alloc) ; + return -1; + } + + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); + + if (p->normal_num == 0) { + return 0; + } + + /* make sure that ramblock is 0 terminated */ + packet->ramblock[255] = 0; + block = qemu_ram_block_by_name(packet->ramblock); + if (!block) { + error_setg(errp, "multifd: unknown ram block %s", + packet->ramblock); + return -1; + } + + p->host = block->host; + for (i = 0; i < p->normal_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[i]); + + if (offset > (block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, block->used_length); + return -1; + } + p->normal[i] = offset; + } + + return 0; +} + +struct { + MultiFDSendParams *params; + /* array of pages to sent */ + MultiFDPages_t *pages; + /* global number of generated multifd packets */ + uint64_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. + * We will use atomic operations. Only valid values are 0 and 1. + */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + +/* + * How we use multifd_send_state->pages and channel->pages? + * + * We create a pages for each channel, and a main one. Each time that + * we need to send a batch of pages we interchange the ones between + * multifd_send_state and the channel that is sending it. There are + * two reasons for that: + * - to not have to do so many mallocs during migration + * - to make easier to know what to free at the end of migration + * + * This way we always know who is the owner of each "pages" struct, + * and we don't need any locking. It belongs to the migration thread + * or to the channel thread. Switching is safe because the migration + * thread is using the channel mutex when changing it, and the channel + * have to had finish with its own, otherwise pending_job can't be + * false. + */ + +static int multifd_send_pages(QEMUFile *f) +{ + int i; + static int next_channel; + MultiFDSendParams *p = NULL; /* make happy gcc */ + MultiFDPages_t *pages = multifd_send_state->pages; + uint64_t transferred; + + if (qatomic_read(&multifd_send_state->exiting)) { + return -1; + } + + qemu_sem_wait(&multifd_send_state->channels_ready); + /* + * next_channel can remain from a previous migration that was + * using more channels, so ensure it doesn't overflow if the + * limit is lower now. 
+ */ + next_channel %= migrate_multifd_channels(); + for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { + p = &multifd_send_state->params[i]; + + qemu_mutex_lock(&p->mutex); + if (p->quit) { + error_report("%s: channel %d has already quit!", __func__, i); + qemu_mutex_unlock(&p->mutex); + return -1; + } + if (!p->pending_job) { + p->pending_job++; + next_channel = (i + 1) % migrate_multifd_channels(); + break; + } + qemu_mutex_unlock(&p->mutex); + } + assert(!p->pages->num); + assert(!p->pages->block); + + p->packet_num = multifd_send_state->packet_num++; + multifd_send_state->pages = p->pages; + p->pages = pages; + transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len; + qemu_file_acct_rate_limit(f, transferred); + ram_counters.multifd_bytes += transferred; + stat64_add(&ram_atomic_counters.transferred, transferred); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); + + return 1; +} + +int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) +{ + MultiFDPages_t *pages = multifd_send_state->pages; + bool changed = false; + + if (!pages->block) { + pages->block = block; + } + + if (pages->block == block) { + pages->offset[pages->num] = offset; + pages->num++; + + if (pages->num < pages->allocated) { + return 1; + } + } else { + changed = true; + } + + if (multifd_send_pages(f) < 0) { + return -1; + } + + if (changed) { + return multifd_queue_page(f, block, offset); + } + + return 1; +} + +static void multifd_send_terminate_threads(Error *err) +{ + int i; + + trace_multifd_send_terminate_threads(err != NULL); + + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); + if (s->state == MIGRATION_STATUS_SETUP || + s->state == MIGRATION_STATUS_PRE_SWITCHOVER || + s->state == MIGRATION_STATUS_DEVICE || + s->state == MIGRATION_STATUS_ACTIVE) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } + } + + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. 
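+     * The qatomic_xchg() below makes that safe: only the first caller
+     * observes the old value 0 and proceeds; any later caller returns
+     * immediately.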
+ */ + if (qatomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } + + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + qemu_mutex_lock(&p->mutex); + p->quit = true; + qemu_sem_post(&p->sem); + if (p->c) { + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } + qemu_mutex_unlock(&p->mutex); + } +} + +void multifd_save_cleanup(void) +{ + int i; + + if (!migrate_use_multifd() || !migrate_multi_channels_is_allowed()) { + return; + } + multifd_send_terminate_threads(NULL); + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->running) { + qemu_thread_join(&p->thread); + } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; + + if (p->registered_yank) { + migration_ioc_unregister_yank(p->c); + } + socket_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_send_state->ops->send_cleanup(p, &local_err); + if (local_err) { + migrate_set_error(migrate_get_current(), local_err); + error_free(local_err); + } + } + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; +} + +static int multifd_zero_copy_flush(QIOChannel *c) +{ + int ret; + Error *err = NULL; + + ret = qio_channel_flush(c, &err); + if (ret < 0) { + error_report_err(err); + return -1; + } + if (ret == 1) { + dirty_sync_missed_zero_copy(); + } + + return ret; +} + +int multifd_send_sync_main(QEMUFile *f) +{ + int i; + bool flush_zero_copy; + + if (!migrate_use_multifd()) { + return 0; + } + if (multifd_send_state->pages->num) { + if (multifd_send_pages(f) < 0) { + error_report("%s: multifd_send_pages fail", __func__); + return -1; + } + } + + /* + * When using zero-copy, it's necessary to flush the pages before any of + * the pages can be sent again, so we'll make sure the new version of the + * pages will always arrive _later_ than the old pages. + * + * Currently we achieve this by flushing the zero-page requested writes + * per ram iteration, but in the future we could potentially optimize it + * to be less frequent, e.g. only after we finished one whole scanning of + * all the dirty bitmaps. 
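+     * (multifd_zero_copy_flush() above also accounts, via
+     * dirty_sync_missed_zero_copy(), for flushes that report some writes
+     * fell back to copying instead of being sent zero-copy.)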
+ */ + + flush_zero_copy = migrate_use_zero_copy_send(); + + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + trace_multifd_send_sync_main_signal(p->id); + + qemu_mutex_lock(&p->mutex); + + if (p->quit) { + error_report("%s: channel %d has already quit", __func__, i); + qemu_mutex_unlock(&p->mutex); + return -1; + } + + p->packet_num = multifd_send_state->packet_num++; + p->flags |= MULTIFD_FLAG_SYNC; + p->pending_job++; + qemu_file_acct_rate_limit(f, p->packet_len); + ram_counters.multifd_bytes += p->packet_len; + stat64_add(&ram_atomic_counters.transferred, p->packet_len); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); + + if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) { + return -1; + } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + trace_multifd_send_sync_main_wait(p->id); + qemu_sem_wait(&p->sem_sync); + } + trace_multifd_send_sync_main(multifd_send_state->packet_num); + + return 0; +} + +static void *multifd_send_thread(void *opaque) +{ + MultiFDSendParams *p = opaque; + Error *local_err = NULL; + int ret = 0; + bool use_zero_copy_send = migrate_use_zero_copy_send(); + + trace_multifd_send_thread_start(p->id); + rcu_register_thread(); + + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } + /* initial packet */ + p->num_packets = 1; + + while (true) { + qemu_sem_wait(&p->sem); + + if (qatomic_read(&multifd_send_state->exiting)) { + break; + } + qemu_mutex_lock(&p->mutex); + + if (p->pending_job) { + uint64_t packet_num = p->packet_num; + uint32_t flags = p->flags; + p->normal_num = 0; + + if (use_zero_copy_send) { + p->iovs_num = 0; + } else { + p->iovs_num = 1; + } + + for (int i = 0; i < p->pages->num; i++) { + p->normal[p->normal_num] = p->pages->offset[i]; + p->normal_num++; + } + + if (p->normal_num) { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; + } + } + multifd_send_fill_packet(p); + p->flags = 0; + p->num_packets++; + p->total_normal_pages += p->normal_num; + p->pages->num = 0; + p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + + trace_multifd_send(p->id, packet_num, p->normal_num, flags, + p->next_packet_size); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + } else { + /* Send header using the same writev call */ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + } + + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, + 0, p->write_flags, &local_err); + if (ret != 0) { + break; + } + + qemu_mutex_lock(&p->mutex); + p->pending_job--; + qemu_mutex_unlock(&p->mutex); + + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&p->sem_sync); + } + qemu_sem_post(&multifd_send_state->channels_ready); + } else if (p->quit) { + qemu_mutex_unlock(&p->mutex); + break; + } else { + qemu_mutex_unlock(&p->mutex); + /* sometimes there are spurious wakeups */ + } + } + +out: + if (local_err) { + trace_multifd_send_error(p->id); + multifd_send_terminate_threads(local_err); + error_free(local_err); + } + + /* + * Error happen, I will exit, but I can't just leave, tell + * who pay attention to me. 
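+     * (That is: on error, post sem_sync and channels_ready so that
+     * multifd_send_sync_main() and multifd_send_pages() are not left
+     * blocked forever waiting on this channel.)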
+ */ + if (ret != 0) { + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); + } + + qemu_mutex_lock(&p->mutex); + p->running = false; + qemu_mutex_unlock(&p->mutex); + + rcu_unregister_thread(); + trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + + return NULL; +} + +static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error *error); + +static void multifd_tls_outgoing_handshake(QIOTask *task, + gpointer opaque) +{ + MultiFDSendParams *p = opaque; + QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); + Error *err = NULL; + + if (qio_task_propagate_error(task, &err)) { + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + } else { + trace_multifd_tls_outgoing_handshake_complete(ioc); + } + + if (!multifd_channel_connect(p, ioc, err)) { + /* + * Error happen, mark multifd_send_thread status as 'quit' although it + * is not created, and then tell who pay attention to me. + */ + p->quit = true; + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_post(&p->sem_sync); + } +} + +static void *multifd_tls_handshake_thread(void *opaque) +{ + MultiFDSendParams *p = opaque; + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + + qio_channel_tls_handshake(tioc, + multifd_tls_outgoing_handshake, + p, + NULL, + NULL); + return NULL; +} + +static void multifd_tls_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + const char *hostname = s->hostname; + QIOChannelTLS *tioc; + + tioc = migration_tls_client_create(s, ioc, hostname, errp); + if (!tioc) { + return; + } + + object_unref(OBJECT(ioc)); + trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); + qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); + p->c = QIO_CHANNEL(tioc); + qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", + multifd_tls_handshake_thread, p, + QEMU_THREAD_JOINABLE); +} + +static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error *error) +{ + trace_multifd_set_outgoing_channel( + ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname, error); + + if (!error) { + if (migrate_channel_requires_tls_upgrade(ioc)) { + multifd_tls_channel_connect(p, ioc, &error); + if (!error) { + /* + * tls_channel_connect will call back to this + * function after the TLS handshake, + * so we mustn't call multifd_send_thread until then + */ + return true; + } else { + return false; + } + } else { + migration_ioc_register_yank(ioc); + p->registered_yank = true; + p->c = ioc; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); + } + return true; + } + + return false; +} + +static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, + QIOChannel *ioc, Error *err) +{ + migrate_set_error(migrate_get_current(), err); + /* Error happen, we need to tell who pay attention to me */ + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_post(&p->sem_sync); + /* + * Although multifd_send_thread is not created, but main migration + * thread neet to judge whether it is running, so we need to mark + * its status. 
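+     * (In other words: the send thread was never created, so mark the
+     * channel as quit here so the migration thread does not keep waiting
+     * for it.)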
+ */ + p->quit = true; + object_unref(OBJECT(ioc)); + error_free(err); +} + +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) +{ + MultiFDSendParams *p = opaque; + QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task)); + Error *local_err = NULL; + + trace_multifd_new_send_channel_async(p->id); + if (qio_task_propagate_error(task, &local_err)) { + goto cleanup; + } else { + p->c = QIO_CHANNEL(sioc); + qio_channel_set_delay(p->c, false); + p->running = true; + if (!multifd_channel_connect(p, sioc, local_err)) { + goto cleanup; + } + return; + } + +cleanup: + multifd_new_send_channel_cleanup(p, sioc, local_err); +} + +int multifd_save_setup(Error **errp) +{ + int thread_count; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + uint8_t i; + + if (!migrate_use_multifd()) { + return 0; + } + if (!migrate_multi_channels_is_allowed()) { + error_setg(errp, "multifd is not supported by current protocol"); + return -1; + } + + thread_count = migrate_multifd_channels(); + multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); + multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_ready, 0); + qatomic_set(&multifd_send_state->exiting, 0); + multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem, 0); + qemu_sem_init(&p->sem_sync, 0); + p->quit = false; + p->pending_job = 0; + p->id = i; + p->pages = multifd_pages_init(page_count); + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + p->name = g_strdup_printf("multifdsend_%d", i); + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + p->normal = g_new0(ram_addr_t, page_count); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; + + if (migrate_use_zero_copy_send()) { + p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } else { + p->write_flags = 0; + } + + socket_send_channel_create(multifd_new_send_channel_async, p); + } + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; + int ret; + + ret = multifd_send_state->ops->send_setup(p, &local_err); + if (ret) { + error_propagate(errp, local_err); + return ret; + } + } + return 0; +} + +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + +static void multifd_recv_terminate_threads(Error *err) +{ + int i; + + trace_multifd_recv_terminate_threads(err != NULL); + + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); + if (s->state == MIGRATION_STATUS_SETUP || + s->state == MIGRATION_STATUS_ACTIVE) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } + } + + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + qemu_mutex_lock(&p->mutex); + p->quit = true; + /* + * We could arrive here for two reasons: + * - 
normal quit, i.e. everything went fine, just finished + * - error quit: We close the channels so the channel threads + * finish the qio_channel_read_all_eof() + */ + if (p->c) { + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } + qemu_mutex_unlock(&p->mutex); + } +} + +int multifd_load_cleanup(Error **errp) +{ + int i; + + if (!migrate_use_multifd() || !migrate_multi_channels_is_allowed()) { + return 0; + } + multifd_recv_terminate_threads(NULL); + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + if (p->running) { + p->quit = true; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); + qemu_thread_join(&p->thread); + } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_recv_state->ops->recv_cleanup(p); + } + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; + + return 0; +} + +void multifd_recv_sync_main(void) +{ + int i; + + if (!migrate_use_multifd()) { + return; + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + trace_multifd_recv_sync_main_wait(p->id); + qemu_sem_wait(&multifd_recv_state->sem_sync); + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + WITH_QEMU_LOCK_GUARD(&p->mutex) { + if (multifd_recv_state->packet_num < p->packet_num) { + multifd_recv_state->packet_num = p->packet_num; + } + } + trace_multifd_recv_sync_main_signal(p->id); + qemu_sem_post(&p->sem_sync); + } + trace_multifd_recv_sync_main(multifd_recv_state->packet_num); +} + +static void *multifd_recv_thread(void *opaque) +{ + MultiFDRecvParams *p = opaque; + Error *local_err = NULL; + int ret; + + trace_multifd_recv_thread_start(p->id); + rcu_register_thread(); + + while (true) { + uint32_t flags; + + if (p->quit) { + break; + } + + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0) { /* EOF */ + break; + } + if (ret == -1) { /* Error */ + break; + } + + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, + p->next_packet_size); + p->num_packets++; + p->total_normal_pages += p->normal_num; + qemu_mutex_unlock(&p->mutex); + + if (p->normal_num) { + ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (ret != 0) { + break; + } + } + + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } + } + + if (local_err) { + multifd_recv_terminate_threads(local_err); + error_free(local_err); + } + qemu_mutex_lock(&p->mutex); + p->running = false; + qemu_mutex_unlock(&p->mutex); + + 
rcu_unregister_thread(); + trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + + return NULL; +} + +int multifd_load_setup(Error **errp) +{ + int thread_count; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + uint8_t i; + + /* + * Return successfully if multiFD recv state is already initialised + * or multiFD is not enabled. + */ + if (multifd_recv_state || !migrate_use_multifd()) { + return 0; + } + + if (!migrate_multi_channels_is_allowed()) { + error_setg(errp, "multifd is not supported by current protocol"); + return -1; + } + thread_count = migrate_multifd_channels(); + multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); + multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); + qatomic_set(&multifd_recv_state->count, 0); + qemu_sem_init(&multifd_recv_state->sem_sync, 0); + multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; + + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem_sync, 0); + p->quit = false; + p->id = i; + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->name = g_strdup_printf("multifdrecv_%d", i); + p->iov = g_new0(struct iovec, page_count); + p->normal = g_new0(ram_addr_t, page_count); + p->page_count = page_count; + p->page_size = qemu_target_page_size(); + } + + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + Error *local_err = NULL; + int ret; + + ret = multifd_recv_state->ops->recv_setup(p, &local_err); + if (ret) { + error_propagate(errp, local_err); + return ret; + } + } + return 0; +} + +bool multifd_recv_all_channels_created(void) +{ + int thread_count = migrate_multifd_channels(); + + if (!migrate_use_multifd()) { + return true; + } + + if (!multifd_recv_state) { + /* Called before any connections created */ + return false; + } + + return thread_count == qatomic_read(&multifd_recv_state->count); +} + +/* + * Try to receive all multifd channels to get ready for the migration. + * Sets @errp when failing to receive the current channel. 
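+ * The initial packet carries the channel id, which is used to index
+ * multifd_recv_state->params[], so the connections may arrive in any
+ * order.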
+ */ +void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) +{ + MultiFDRecvParams *p; + Error *local_err = NULL; + int id; + + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + + p = &multifd_recv_state->params[id]; + if (p->c != NULL) { + error_setg(&local_err, "multifd: received id '%d' already setup'", + id); + multifd_recv_terminate_threads(local_err); + error_propagate(errp, local_err); + return; + } + p->c = ioc; + object_ref(OBJECT(ioc)); + /* initial packet */ + p->num_packets = 1; + + p->running = true; + qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, + QEMU_THREAD_JOINABLE); + qatomic_inc(&multifd_recv_state->count); +} diff --git a/migration/multifd.h b/migration/multifd.h index e2802a9ce2..ff3aa2e2e9 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -18,7 +18,7 @@ void multifd_save_cleanup(void); int multifd_load_setup(Error **errp); int multifd_load_cleanup(Error **errp); bool multifd_recv_all_channels_created(void); -bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp); +void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(QEMUFile *f); int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index b9a37ef255..b98e95dab0 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -37,6 +37,7 @@ #include "qemu-file.h" #include "yank_functions.h" #include "tls.h" +#include "qemu/userfaultfd.h" /* Arbitrary limit on size of each discard command, * keeps them around ~200 bytes @@ -226,11 +227,9 @@ static bool receive_ufd_features(uint64_t *features) int ufd; bool ret = true; - /* if we are here __NR_userfaultfd should exists */ - ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + ufd = uffd_open(O_CLOEXEC); if (ufd == -1) { - error_report("%s: syscall __NR_userfaultfd failed: %s", __func__, - strerror(errno)); + error_report("%s: uffd_open() failed: %s", __func__, strerror(errno)); return false; } @@ -375,7 +374,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) goto out; } - ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + ufd = uffd_open(O_CLOEXEC); if (ufd == -1) { error_report("%s: userfaultfd not available: %s", __func__, strerror(errno)); @@ -1160,7 +1159,7 @@ static int postcopy_temp_pages_setup(MigrationIncomingState *mis) int postcopy_ram_incoming_setup(MigrationIncomingState *mis) { /* Open the fd for the kernel to give us userfaults */ - mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK); if (mis->userfault_fd == -1) { error_report("%s: Failed to open userfault fd: %s", __func__, strerror(errno)); @@ -1539,7 +1538,7 @@ void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd) } } -bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) +void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) { /* * The new loading channel has its own threads, so it needs to be @@ -1548,9 +1547,6 @@ bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) qemu_file_set_blocking(file, true); mis->postcopy_qemufile_dst = file; trace_postcopy_preempt_new_channel(); - - 
/* Start the migration immediately */ - return true; } /* diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h index 6147bf7d1d..25881c4127 100644 --- a/migration/postcopy-ram.h +++ b/migration/postcopy-ram.h @@ -190,7 +190,7 @@ enum PostcopyChannels { RAM_CHANNEL_MAX, }; -bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file); +void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file); int postcopy_preempt_setup(MigrationState *s, Error **errp); int postcopy_preempt_wait_channel(MigrationState *s); diff --git a/migration/ram.c b/migration/ram.c index 334309f1c6..b966e148c2 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1774,13 +1774,15 @@ out: static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, ram_addr_t size) { + const ram_addr_t end = offset + size; + /* * We read one byte of each page; this will preallocate page tables if * required and populate the shared zeropage on MAP_PRIVATE anonymous memory * where no page was populated yet. This might require adaption when * supporting other mappings, like shmem. */ - for (; offset < size; offset += block->page_size) { + for (; offset < end; offset += block->page_size) { char tmp = *((char *)block->host + offset); /* Don't optimize the read out */ @@ -1863,6 +1865,39 @@ void ram_write_tracking_prepare(void) } } +static inline int uffd_protect_section(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr offset = section->offset_within_region; + RAMBlock *rb = section->mr->ram_block; + int uffd_fd = (uintptr_t)opaque; + + return uffd_change_protection(uffd_fd, rb->host + offset, size, true, + false); +} + +static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) +{ + assert(rb->flags & RAM_UF_WRITEPROTECT); + + /* See ram_block_populate_read() */ + if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); + MemoryRegionSection section = { + .mr = rb->mr, + .offset_within_region = 0, + .size = rb->mr->size, + }; + + return ram_discard_manager_replay_populated(rdm, §ion, + uffd_protect_section, + (void *)(uintptr_t)uffd_fd); + } + return uffd_change_protection(uffd_fd, rb->host, + rb->used_length, true, false); +} + /* * ram_write_tracking_start: start UFFD-WP memory tracking * @@ -1894,14 +1929,14 @@ int ram_write_tracking_start(void) block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { goto fail; } - /* Apply UFFD write protection to the block memory range */ - if (uffd_change_protection(rs->uffdio_fd, block->host, - block->max_length, true, false)) { - goto fail; - } block->flags |= RAM_UF_WRITEPROTECT; memory_region_ref(block->mr); + /* Apply UFFD write protection to the block memory range */ + if (ram_block_uffd_protect(block, uffd_fd)) { + goto fail; + } + trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, block->host, block->max_length); } @@ -1915,12 +1950,6 @@ fail: if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { continue; } - /* - * In case some memory block failed to be write-protected - * remove protection and unregister all succeeded RAM blocks - */ - uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, - false, false); uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); /* Cleanup flags and remove reference */ block->flags &= ~RAM_UF_WRITEPROTECT; @@ -1946,9 +1975,6 @@ void ram_write_tracking_stop(void) if ((block->flags & RAM_UF_WRITEPROTECT) == 0) 
{ continue; } - /* Remove protection and unregister all affected RAM blocks */ - uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, - false, false); uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, @@ -2319,8 +2345,25 @@ static void pss_host_page_prepare(PageSearchStatus *pss) size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; pss->host_page_sending = true; - pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); - pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); + if (guest_pfns <= 1) { + /* + * This covers both when guest psize == host psize, or when guest + * has larger psize than the host (guest_pfns==0). + * + * For the latter, we always send one whole guest page per + * iteration of the host page (example: an Alpha VM on x86 host + * will have guest psize 8K while host psize 4K). + */ + pss->host_page_start = pss->page; + pss->host_page_end = pss->page + 1; + } else { + /* + * The host page spans over multiple guest pages, we send them + * within the same host page iteration. + */ + pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); + pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); + } } /* @@ -3392,19 +3435,35 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return 0; } -static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +static void ram_state_pending_estimate(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { RAMState **temp = opaque; RAMState *rs = *temp; - uint64_t remaining_size; - remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; + uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; - if (!migration_in_postcopy() && - remaining_size < max_size) { + if (migrate_postcopy_ram()) { + /* We can do postcopy, and all the data is postcopiable */ + *res_postcopy_only += remaining_size; + } else { + *res_precopy_only += remaining_size; + } +} + +static void ram_state_pending_exact(void *opaque, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) +{ + RAMState **temp = opaque; + RAMState *rs = *temp; + + uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; + + if (!migration_in_postcopy()) { qemu_mutex_lock_iothread(); WITH_RCU_READ_LOCK_GUARD() { migration_bitmap_sync_precopy(rs); @@ -4091,12 +4150,6 @@ int ram_load_postcopy(QEMUFile *f, int channel) return ret; } -static bool postcopy_is_advised(void) -{ - PostcopyState ps = postcopy_state_get(); - return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; -} - static bool postcopy_is_running(void) { PostcopyState ps = postcopy_state_get(); @@ -4167,7 +4220,7 @@ static int ram_load_precopy(QEMUFile *f) MigrationIncomingState *mis = migration_incoming_get_current(); int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ - bool postcopy_advised = postcopy_is_advised(); + bool postcopy_advised = migration_incoming_postcopy_advised(); if (!migrate_use_compression()) { invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } @@ -4560,7 +4613,8 @@ static SaveVMHandlers savevm_ram_handlers = { .save_live_complete_postcopy = ram_save_complete, .save_live_complete_precopy = ram_save_complete, .has_postcopy = ram_has_postcopy, - .save_live_pending 
= ram_save_pending, + .state_pending_exact = ram_state_pending_exact, + .state_pending_estimate = ram_state_pending_estimate, .load_state = ram_load, .save_cleanup = ram_save_cleanup, .load_setup = ram_load_setup, diff --git a/migration/rdma.c b/migration/rdma.c index 94a55dd95b..288eadc2d2 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -2785,7 +2785,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, rdma = qatomic_rcu_read(&rioc->rdmaout); if (!rdma) { - return -EIO; + error_setg(errp, "RDMA control channel output is not set"); + return -1; } CHECK_ERROR_STATE(); @@ -2797,7 +2798,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, ret = qemu_rdma_write_flush(f, rdma); if (ret < 0) { rdma->error_state = ret; - return ret; + error_setg(errp, "qemu_rdma_write_flush returned %d", ret); + return -1; } for (i = 0; i < niov; i++) { @@ -2816,7 +2818,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, if (ret < 0) { rdma->error_state = ret; - return ret; + error_setg(errp, "qemu_rdma_exchange_send returned %d", ret); + return -1; } data += len; @@ -2854,6 +2857,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); @@ -2867,7 +2871,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, rdma = qatomic_rcu_read(&rioc->rdmain); if (!rdma) { - return -EIO; + error_setg(errp, "RDMA control channel input is not set"); + return -1; } CHECK_ERROR_STATE(); @@ -2903,7 +2908,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, if (ret < 0) { rdma->error_state = ret; - return ret; + error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret); + return -1; } /* diff --git a/migration/savevm.c b/migration/savevm.c index a783789430..e9cf4999ad 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -42,7 +42,6 @@ #include "postcopy-ram.h" #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" -#include "qapi/qmp/json-writer.h" #include "qapi/clone-visitor.h" #include "qapi/qapi-builtin-visit.h" #include "qapi/qmp/qerror.h" @@ -67,6 +66,7 @@ #include "net/announce.h" #include "qemu/yank.h" #include "yank_functions.h" +#include "sysemu/qtest.h" const unsigned int postcopy_ram_discard_version; @@ -586,6 +586,7 @@ static void dump_vmstate_vmsd(FILE *out_file, field++; first = false; } + assert(field->flags == VMS_END); fprintf(out_file, "\n%*s]", indent, ""); } if (vmsd->subsections != NULL) { @@ -804,6 +805,42 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque) } } +/* + * Perform some basic checks on vmsd's at registration + * time. + */ +static void vmstate_check(const VMStateDescription *vmsd) +{ + const VMStateField *field = vmsd->fields; + const VMStateDescription **subsection = vmsd->subsections; + + if (field) { + while (field->name) { + if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) { + /* Recurse to sub structures */ + vmstate_check(field->vmsd); + } + /* Carry on */ + field++; + } + /* Check for the end of field list canary */ + if (field->flags != VMS_END) { + error_report("VMSTATE not ending with VMS_END: %s", vmsd->name); + g_assert_not_reached(); + } + } + + while (subsection && *subsection) { + /* + * The name of a subsection should start with the name of the + * current object. 
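+         * For example, a parent vmsd named "dev" would use subsection
+         * names such as "dev/extended-state"; the assert below checks
+         * exactly that prefix (the names here are only illustrative).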
+ */ + assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name))); + vmstate_check(*subsection); + subsection++; + } +} + int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, const VMStateDescription *vmsd, void *opaque, int alias_id, @@ -849,6 +886,11 @@ int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, } else { se->instance_id = instance_id; } + + /* Perform a recursive sanity check during the test runs */ + if (qtest_enabled()) { + vmstate_check(vmsd); + } assert(!se->compat || se->instance_id == 0); savevm_state_handler_insert(se); return 0; @@ -898,17 +940,6 @@ static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, } } -static int vmstate_save(QEMUFile *f, SaveStateEntry *se, - JSONWriter *vmdesc) -{ - trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)"); - if (!se->vmsd) { - vmstate_save_old_style(f, se, vmdesc); - return 0; - } - return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc); -} - /* * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL) */ @@ -942,6 +973,43 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } +static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc) +{ + int ret; + + if ((!se->ops || !se->ops->save_state) && !se->vmsd) { + return 0; + } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + trace_savevm_section_skip(se->idstr, se->section_id); + return 0; + } + + trace_savevm_section_start(se->idstr, se->section_id); + save_section_header(f, se, QEMU_VM_SECTION_FULL); + if (vmdesc) { + json_writer_start_object(vmdesc, NULL); + json_writer_str(vmdesc, "name", se->idstr); + json_writer_int64(vmdesc, "instance_id", se->instance_id); + } + + trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)"); + if (!se->vmsd) { + vmstate_save_old_style(f, se, vmdesc); + } else { + ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc); + if (ret) { + return ret; + } + } + + trace_savevm_section_end(se->idstr, se->section_id, 0); + save_section_footer(f, se); + if (vmdesc) { + json_writer_end_object(vmdesc); + } + return 0; +} /** * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the * command and associated data. 
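The consolidated vmstate_save() above writes the section header, the optional JSON vmdesc entry, the device state and the section footer in one place, which lets the setup path below reuse it for devices that want their state sent early. As a rough illustration only (the device, its state struct and its field are invented here; only the early_setup flag and the VMSTATE_* macros come from QEMU and this series), such a device would mark its VMStateDescription like this:

    #include "migration/vmstate.h"

    typedef struct XDeviceState {
        uint32_t config_word;
    } XDeviceState;

    static const VMStateDescription vmstate_x_device = {
        .name = "x-device",
        .version_id = 1,
        .minimum_version_id = 1,
        .early_setup = true,   /* full state written from qemu_savevm_state_setup() */
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(config_word, XDeviceState),
            VMSTATE_END_OF_LIST()
        },
    };

With that flag set, the core calls vmstate_save() for the entry during qemu_savevm_state_setup() and skips it again in the precopy-completion loop, as the early_setup checks in the hunks below show.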
@@ -1164,12 +1232,27 @@ bool qemu_savevm_state_guest_unplug_pending(void) void qemu_savevm_state_setup(QEMUFile *f) { + MigrationState *ms = migrate_get_current(); SaveStateEntry *se; Error *local_err = NULL; int ret; + ms->vmdesc = json_writer_new(false); + json_writer_start_object(ms->vmdesc, NULL); + json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size()); + json_writer_start_array(ms->vmdesc, "devices"); + trace_savevm_state_setup(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (se->vmsd && se->vmsd->early_setup) { + ret = vmstate_save(f, se, ms->vmdesc); + if (ret) { + qemu_file_set_error(f, ret); + break; + } + continue; + } + if (!se->ops || !se->ops->save_setup) { continue; } @@ -1365,41 +1448,23 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) { - g_autoptr(JSONWriter) vmdesc = NULL; + MigrationState *ms = migrate_get_current(); + JSONWriter *vmdesc = ms->vmdesc; int vmdesc_len; SaveStateEntry *se; int ret; - vmdesc = json_writer_new(false); - json_writer_start_object(vmdesc, NULL); - json_writer_int64(vmdesc, "page_size", qemu_target_page_size()); - json_writer_start_array(vmdesc, "devices"); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - - if ((!se->ops || !se->ops->save_state) && !se->vmsd) { - continue; - } - if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { - trace_savevm_section_skip(se->idstr, se->section_id); + if (se->vmsd && se->vmsd->early_setup) { + /* Already saved during qemu_savevm_state_setup(). */ continue; } - trace_savevm_section_start(se->idstr, se->section_id); - - json_writer_start_object(vmdesc, NULL); - json_writer_str(vmdesc, "name", se->idstr); - json_writer_int64(vmdesc, "instance_id", se->instance_id); - - save_section_header(f, se, QEMU_VM_SECTION_FULL); ret = vmstate_save(f, se, vmdesc); if (ret) { qemu_file_set_error(f, ret); return ret; } - trace_savevm_section_end(se->idstr, se->section_id, 0); - save_section_footer(f, se); - - json_writer_end_object(vmdesc); } if (inactivate_disks) { @@ -1428,6 +1493,10 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len); } + /* Free it now to detect any inconsistencies. */ + json_writer_free(vmdesc); + ms->vmdesc = NULL; + return 0; } @@ -1472,10 +1541,9 @@ flush: * the result is split into the amount for units that can and * for units that can't do postcopy. 
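 * (The _estimate variant is the cheap query: for RAM it simply multiplies
 * migration_dirty_pages by the page size, whereas the _exact variant first
 * re-syncs the dirty bitmap under the iothread lock outside postcopy; see
 * ram_state_pending_estimate() and ram_state_pending_exact() in the ram.c
 * hunk above.)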
*/ -void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only) +void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) { SaveStateEntry *se; @@ -1483,9 +1551,8 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size, *res_compatible = 0; *res_postcopy_only = 0; - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_pending) { + if (!se->ops || !se->ops->state_pending_exact) { continue; } if (se->ops->is_active) { @@ -1493,9 +1560,34 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size, continue; } } - se->ops->save_live_pending(f, se->opaque, threshold_size, - res_precopy_only, res_compatible, - res_postcopy_only); + se->ops->state_pending_exact(se->opaque, + res_precopy_only, res_compatible, + res_postcopy_only); + } +} + +void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) +{ + SaveStateEntry *se; + + *res_precopy_only = 0; + *res_compatible = 0; + *res_postcopy_only = 0; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->state_pending_estimate) { + continue; + } + if (se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + se->ops->state_pending_estimate(se->opaque, + res_precopy_only, res_compatible, + res_postcopy_only); } } @@ -1595,21 +1687,10 @@ int qemu_save_device_state(QEMUFile *f) if (se->is_ram) { continue; } - if ((!se->ops || !se->ops->save_state) && !se->vmsd) { - continue; - } - if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { - continue; - } - - save_section_header(f, se, QEMU_VM_SECTION_FULL); - ret = vmstate_save(f, se, NULL); if (ret) { return ret; } - - save_section_footer(f, se); } qemu_put_byte(f, QEMU_VM_EOF); diff --git a/migration/savevm.h b/migration/savevm.h index 6461342cb4..b1901e68d5 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -40,10 +40,12 @@ void qemu_savevm_state_cleanup(void); void qemu_savevm_state_complete_postcopy(QEMUFile *f); int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, bool inactivate_disks); -void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size, - uint64_t *res_precopy_only, - uint64_t *res_compatible, - uint64_t *res_postcopy_only); +void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only); +void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only); void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); void qemu_savevm_send_open_return_path(QEMUFile *f); int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); diff --git a/migration/threadinfo.c b/migration/threadinfo.c new file mode 100644 index 0000000000..1de8b31855 --- /dev/null +++ b/migration/threadinfo.c @@ -0,0 +1,51 @@ +/* + * Migration Threads info + * + * Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Jiang Jiacheng + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "threadinfo.h" + +static QLIST_HEAD(, MigrationThread) migration_threads; + +MigrationThread *MigrationThreadAdd(const char *name, int thread_id) +{ + MigrationThread *thread = g_new0(MigrationThread, 1); + thread->name = name; + thread->thread_id = thread_id; + + QLIST_INSERT_HEAD(&migration_threads, thread, node); + + return thread; +} + +void MigrationThreadDel(MigrationThread *thread) +{ + if (thread) { + QLIST_REMOVE(thread, node); + g_free(thread); + } +} + +MigrationThreadInfoList *qmp_query_migrationthreads(Error **errp) +{ + MigrationThreadInfoList *head = NULL; + MigrationThreadInfoList **tail = &head; + MigrationThread *thread = NULL; + + QLIST_FOREACH(thread, &migration_threads, node) { + MigrationThreadInfo *info = g_new0(MigrationThreadInfo, 1); + info->name = g_strdup(thread->name); + info->thread_id = thread->thread_id; + + QAPI_LIST_APPEND(tail, info); + } + + return head; +} diff --git a/migration/threadinfo.h b/migration/threadinfo.h new file mode 100644 index 0000000000..4d69423c0a --- /dev/null +++ b/migration/threadinfo.h @@ -0,0 +1,28 @@ +/* + * Migration Threads info + * + * Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Jiang Jiacheng + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/queue.h" +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-migration.h" + +typedef struct MigrationThread MigrationThread; + +struct MigrationThread { + const char *name; /* the name of migration thread */ + int thread_id; /* ID of the underlying host thread */ + QLIST_ENTRY(MigrationThread) node; +}; + +MigrationThread *MigrationThreadAdd(const char *name, int thread_id); + +void MigrationThreadDel(MigrationThread *info); diff --git a/migration/trace-events b/migration/trace-events index 57003edcbd..67b65a70ff 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -150,7 +150,8 @@ migrate_fd_cleanup(void) "" migrate_fd_error(const char *error_desc) "error=%s" migrate_fd_cancel(void) "" migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx" -migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")" +migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")" +migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")" migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d" migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64 migration_completion_file_err(void) "" @@ -330,7 +331,7 @@ send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uin dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d" dirty_bitmap_save_complete_enter(void) "" dirty_bitmap_save_complete_finish(void) "" -dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64 +dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64 dirty_bitmap_load_complete(void) "" dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32 dirty_bitmap_load_bits_zeroes(void) "" @@ -355,7 +356,7 @@ 
migration_block_save_device_dirty(int64_t sector) "Error reading sector %" PRId6 migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d" migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d" migration_block_save_complete(void) "Block migration completed" -migration_block_save_pending(uint64_t pending) "Enter save live pending %" PRIu64 +migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64 # page_cache.c migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64 diff --git a/migration/vmstate.c b/migration/vmstate.c index 924494bda3..83ca4c7d3e 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -154,6 +154,7 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, } field++; } + assert(field->flags == VMS_END); ret = vmstate_subsection_load(f, vmsd, opaque); if (ret != 0) { return ret; @@ -408,6 +409,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, } field++; } + assert(field->flags == VMS_END); if (vmdesc) { json_writer_end_array(vmdesc); diff --git a/qapi/migration.json b/qapi/migration.json index 88ecf86ac8..c84fa10e86 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1958,6 +1958,35 @@ { 'command': 'query-vcpu-dirty-limit', 'returns': [ 'DirtyLimitInfo' ] } +## +# @MigrationThreadInfo: +# +# Information about migrationthreads +# +# @name: the name of migration thread +# +# @thread-id: ID of the underlying host thread +# +# Since: 7.2 +## +{ 'struct': 'MigrationThreadInfo', + 'data': {'name': 'str', + 'thread-id': 'int'} } + +## +# @query-migrationthreads: +# +# Returns information of migration threads +# +# data: migration thread name +# +# returns: information about migration threads +# +# Since: 7.2 +## +{ 'command': 'query-migrationthreads', + 'returns': ['MigrationThreadInfo'] } + ## # @snapshot-save: # diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index 196b78c00d..199227a556 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -614,7 +614,7 @@ static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz, iov.iov_base = buf; iov.iov_len = sz; n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1, - &fds, &nfds, errp); + &fds, &nfds, 0, errp); if (n_read == QIO_CHANNEL_ERR_BLOCK) { qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN); diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 1dd32c9506..109bc8e7b1 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -61,14 +61,14 @@ static bool uffd_feature_thread_id; #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD) #include #include -#include +#include "qemu/userfaultfd.h" static bool ufd_version_check(void) { struct uffdio_api api_struct; uint64_t ioctl_mask; - int ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + int ufd = uffd_open(O_CLOEXEC); if (ufd == -1) { g_test_message("Skipping test: userfaultfd not available"); diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c index 73e0000a2c..f05fe12f01 100644 --- a/tests/qtest/tpm-emu.c +++ b/tests/qtest/tpm-emu.c @@ -115,7 +115,7 @@ void *tpm_emu_ctrl_thread(void *data) int *pfd = NULL; size_t nfd = 0; - qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort); + qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort); cmd = be32_to_cpu(cmd); g_assert_cmpint(cmd, ==, 
CMD_SET_DATAFD); g_assert_cmpint(nfd, ==, 1); diff --git a/tests/unit/test-io-channel-socket.c b/tests/unit/test-io-channel-socket.c index b36a5d972a..b964bb202d 100644 --- a/tests/unit/test-io-channel-socket.c +++ b/tests/unit/test-io-channel-socket.c @@ -460,6 +460,7 @@ static void test_io_channel_unix_fd_pass(void) G_N_ELEMENTS(iorecv), &fdrecv, &nfdrecv, + 0, &error_abort); g_assert(nfdrecv == G_N_ELEMENTS(fdsend)); diff --git a/util/userfaultfd.c b/util/userfaultfd.c index f1cd6af2b1..4953b3137d 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -19,6 +19,15 @@ #include #include +int uffd_open(int flags) +{ +#if defined(__NR_userfaultfd) + return syscall(__NR_userfaultfd, flags); +#else + return -EINVAL; +#endif +} + /** * uffd_query_features: query UFFD features * @@ -32,7 +41,7 @@ int uffd_query_features(uint64_t *features) struct uffdio_api api_struct = { 0 }; int ret = -1; - uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + uffd_fd = uffd_open(O_CLOEXEC); if (uffd_fd < 0) { trace_uffd_query_features_nosys(errno); return -1; @@ -69,7 +78,7 @@ int uffd_create_fd(uint64_t features, bool non_blocking) uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); - uffd_fd = syscall(__NR_userfaultfd, flags); + uffd_fd = uffd_open(flags); if (uffd_fd < 0) { trace_uffd_create_fd_nosys(errno); return -1; diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 232984ace6..145eb17c08 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -116,7 +116,7 @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg) * qio_channel_readv_full may have short reads, keeping calling it * until getting VHOST_USER_HDR_SIZE or 0 bytes in total */ - rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err); + rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err); if (rc < 0) { if (rc == QIO_CHANNEL_ERR_BLOCK) { assert(local_err == NULL);
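            /*
             * Note: the extra 0 passed to qio_channel_readv_full() above is
             * the flags argument this series adds to the readv path; every
             * caller converted here passes 0, i.e. no special read flags.
             */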