Merge tag 'migration-20230206-pull-request' of https://gitlab.com/juan.quintela/qemu into staging

Migration Pull request

In this try:
- rebase to latest upstream
- same as previous patch
- fix compilation on non-Linux (userfaultfd.h) (me)
- query-migrationthreads (jiang)
- fix race on reading MultiFDPages_t.block (zhenzhong)
- fix flush of zero copy page send request (zhenzhong)

Please apply.

Previous try:
It includes:
- David Hildenbrand fixes for virtio-mem
- David Gilbert canary to detect problems
- Fix for rdma return values (Fiona)
- Peter Xu uffd_open fixes
- Peter Xu show right downtime for postcopy
- manish.mishra MSG_PEEK fixes
- my vfio changes.

Please apply.

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEGJn/jt6/WMzuA0uC9IfvGFhy1yMFAmPhobYACgkQ9IfvGFhy
# 1yMNaA/9EHDPqrI1HL/VkJG4nNOOsQR7RbburXEberZOzvLjnqpjUD3Ls9qV6rx+
# ieHa5T4imYJFk72Wa5vx4r1/dCjtJD2W6jg5+/0nTvYAHrs1U1VRqpuTr0HiXdbJ
# ZLLCnW5eDyO3eMaOX0MUkgHgL0FNkc/Lq5ViCTFsMu9O9xMuDLLdAC3cdvslKuOu
# X1gKByr9jT817Y9e36amYmRaJKC6Cr/PIekNVFu12HBW79pPusLX8KWEf4RBw4HR
# sPwTvMCR/BwZ0+2Lppan60G5rt/ZxDu40oU7y+RHlfWqevl4hDM84/nhjMvEgzc5
# a4Ahe2ERGLwwnC8z3l7v9+pEzSGzDoPcnRGvZcpUpk68wTDtxd5Bdq8CwmNUfL07
# VzWcYpH0yvmwjBba9jfn9fAVgnG5rVp558XcYLIII3wEToty3UDtm43wSdj2CGr6
# cu+IPAp+n/I5G9SRYBTU9ozJz45ttnEe0hxUtZ4I3MuhzHi1VEDAqTWM/X0LyS41
# TB3Y5B2KKpJYbPyZEH4nyTeetR2k7alTFzahCgKqVfOgL0nJx54petjS1K+B1P72
# g6lhP9WnQ33W+M8S7J/aGEaDJd1lFyFB2Rdjn2ZZnASH/fR9j0mFmXWvulXtjFNp
# Sfim3887+Iv4Uzw4VWEe3mM5Ypi/Ba2CmuTjy/pM08Ey8X1Qs5o=
# =ZQbR
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Feb 2023 00:56:22 GMT
# gpg:                using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full]
# gpg:                 aka "Juan Quintela <quintela@trasno.org>" [full]
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03  4B82 F487 EF18 5872 D723

* tag 'migration-20230206-pull-request' of https://gitlab.com/juan.quintela/qemu: (30 commits)
  migration: save/delete migration thread info
  migration: Introduce interface query-migrationthreads
  multifd: Fix flush of zero copy page send request
  multifd: Fix a race on reading MultiFDPages_t.block
  migration: check magic value for deciding the mapping of channels
  io: Add support for MSG_PEEK for socket channel
  migration/dirtyrate: Show sample pages only in page-sampling mode
  migration: Perform vmsd structure check during tests
  migration: Add canary to VMSTATE_END_OF_LIST
  migration/rdma: fix return value for qio_channel_rdma_{readv,writev}
  migration: Show downtime during postcopy phase
  virtio-mem: Proper support for preallocation with migration
  virtio-mem: Migrate immutable properties early
  virtio-mem: Fail if a memory backend with "prealloc=on" is specified
  migration/ram: Factor out check for advised postcopy
  migration/vmstate: Introduce VMSTATE_WITH_TMP_TEST() and VMSTATE_BITMAP_TEST()
  migration/savevm: Allow immutable device state to be migrated early (i.e., before RAM)
  migration/savevm: Prepare vmdesc json writer in qemu_savevm_state_setup()
  migration/savevm: Move more savevm handling into vmstate_save()
  migration/ram: Optimize ram_write_tracking_start() for RamDiscardManager
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Branch: master
Peter Maydell 2023-02-07 15:16:51 +00:00
commit b86307ecef
53 changed files with 2135 additions and 234 deletions

View File

@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
&msgfds, &msgfds_num,
NULL);
0, NULL);
} else {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
NULL, NULL,
NULL);
0, NULL);
}
if (msgfds_num) {

View File

@ -0,0 +1,7 @@
# Boards:
#
CONFIG_ISAPC=n
CONFIG_I440FX=n
CONFIG_Q35=n
CONFIG_MICROVM=y

View File

@ -0,0 +1,6 @@
# Boards:
#
CONFIG_ISAPC=y
CONFIG_I440FX=y
CONFIG_Q35=y
CONFIG_MICROVM=y

View File

@ -482,15 +482,17 @@ An iterative device must provide:
- A ``load_setup`` function that initialises the data structures on the
destination.
- A ``save_live_pending`` function that is called repeatedly and must
indicate how much more data the iterative data must save. The core
migration code will use this to determine when to pause the CPUs
and complete the migration.
- A ``state_pending_exact`` function that indicates how much more
data we must save. The core migration code will use this to
determine when to pause the CPUs and complete the migration.
- A ``save_live_iterate`` function (called after ``save_live_pending``
when there is significant data still to be sent). It should send
a chunk of data until the point that stream bandwidth limits tell it
to stop. Each call generates one section.
- A ``state_pending_estimate`` function that indicates how much more
data we must save. When the estimated amount is smaller than the
threshold, we call ``state_pending_exact``.
- A ``save_live_iterate`` function should send a chunk of data until
the point that stream bandwidth limits tell it to stop. Each call
generates one section.
- A ``save_live_complete_precopy`` function that must transmit the
last section for the device containing any remaining data.
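To make the reworked interface concrete, here is a minimal sketch (not code from this series: the "demo" device, its field and the demo_query_dirty_bytes() helper are invented for illustration) of how an iterative device might implement the split state_pending_estimate / state_pending_exact hooks described above:

#include "migration/register.h"    /* SaveVMHandlers is declared here */

typedef struct DemoDeviceState {
    uint64_t dirty_bytes;          /* cached, possibly stale amount left */
} DemoDeviceState;

static void demo_state_pending_estimate(void *opaque,
                                        uint64_t *res_precopy_only,
                                        uint64_t *res_compatible,
                                        uint64_t *res_postcopy_only)
{
    DemoDeviceState *s = opaque;

    /* Cheap path: report the cached counter without touching the device. */
    *res_precopy_only += s->dirty_bytes;
}

static void demo_state_pending_exact(void *opaque,
                                     uint64_t *res_precopy_only,
                                     uint64_t *res_compatible,
                                     uint64_t *res_postcopy_only)
{
    DemoDeviceState *s = opaque;

    /* Expensive path: query the device so the counter is precise.
     * demo_query_dirty_bytes() is a hypothetical helper. */
    s->dirty_bytes = demo_query_dirty_bytes(s);
    *res_precopy_only += s->dirty_bytes;
}

static SaveVMHandlers savevm_demo_handlers = {
    /* .save_setup, .save_live_iterate, .save_live_complete_precopy and
     * friends are registered exactly as before; only the pending hooks
     * changed with this series. */
    .state_pending_estimate = demo_state_pending_estimate,
    .state_pending_exact    = demo_state_pending_exact,
};

The core migration code calls the cheap estimate on every iteration and only falls back to the exact variant once the estimated amount drops below the threshold, as the reworked migration_iteration_run() later in this page shows.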

View File

@ -28,7 +28,7 @@ VFIO implements the device hooks for the iterative approach as follows:
* A ``load_setup`` function that sets up the migration region on the
destination and sets _RESUMING flag in the VFIO device state.
* A ``save_live_pending`` function that reads pending_bytes from the vendor
* A ``state_pending_exact`` function that reads pending_bytes from the vendor
driver, which indicates the amount of data that the vendor driver has yet to
save for the VFIO device.
@ -114,7 +114,7 @@ Live migration save path
(RUNNING, _SETUP, _RUNNING|_SAVING)
|
(RUNNING, _ACTIVE, _RUNNING|_SAVING)
If device is active, get pending_bytes by .save_live_pending()
If device is active, get pending_bytes by .state_pending_exact()
If total pending_bytes >= threshold_size, call .save_live_iterate()
Data of VFIO device for pre-copy phase is copied
Iterate till total pending bytes converge and are less than threshold

View File

@ -41,7 +41,9 @@
#include "hw/virtio/virtio-pci.h"
#include "qom/object_interfaces.h"
GlobalProperty hw_compat_7_2[] = {};
GlobalProperty hw_compat_7_2[] = {
{ "virtio-mem", "x-early-migration", "false" },
};
const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
GlobalProperty hw_compat_7_1[] = {

View File

@ -182,10 +182,10 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
return 0;
}
static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void cmma_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
S390StAttribState *sas = S390_STATTRIB(opaque);
S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
@ -371,7 +371,8 @@ static SaveVMHandlers savevm_s390_stattrib_handlers = {
.save_setup = cmma_save_setup,
.save_live_iterate = cmma_save_iterate,
.save_live_complete_precopy = cmma_save_complete,
.save_live_pending = cmma_save_pending,
.state_pending_exact = cmma_state_pending,
.state_pending_estimate = cmma_state_pending,
.save_cleanup = cmma_save_cleanup,
.load_state = cmma_load,
.is_active = cmma_active,

View File

@ -456,11 +456,10 @@ static void vfio_save_cleanup(void *opaque)
trace_vfio_save_cleanup(vbasedev->name);
}
static void vfio_save_pending(QEMUFile *f, void *opaque,
uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void vfio_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
@ -473,7 +472,7 @@ static void vfio_save_pending(QEMUFile *f, void *opaque,
*res_precopy_only += migration->pending_bytes;
trace_vfio_save_pending(vbasedev->name, *res_precopy_only,
trace_vfio_state_pending(vbasedev->name, *res_precopy_only,
*res_postcopy_only, *res_compatible);
}
@ -515,9 +514,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
}
/*
* Reset pending_bytes as .save_live_pending is not called during savevm or
* snapshot case, in such case vfio_update_pending() at the start of this
* function updates pending_bytes.
* Reset pending_bytes as state_pending* are not called during
* savevm or snapshot case, in such case vfio_update_pending() at
* the start of this function updates pending_bytes.
*/
migration->pending_bytes = 0;
trace_vfio_save_iterate(vbasedev->name, data_size);
@ -685,7 +684,8 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
static SaveVMHandlers savevm_vfio_handlers = {
.save_setup = vfio_save_setup,
.save_cleanup = vfio_save_cleanup,
.save_live_pending = vfio_save_pending,
.state_pending_exact = vfio_state_pending,
.state_pending_estimate = vfio_state_pending,
.save_live_iterate = vfio_save_iterate,
.save_live_complete_precopy = vfio_save_complete_precopy,
.save_state = vfio_save_state,

View File

@ -157,7 +157,7 @@ vfio_save_cleanup(const char *name) " (%s)"
vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64
vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64
vfio_save_device_config_state(const char *name) " (%s)"
vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
vfio_save_complete_precopy(const char *name) " (%s)"
vfio_load_device_config_state(const char *name) " (%s)"

View File

@ -31,6 +31,8 @@
#include CONFIG_DEVICES
#include "trace.h"
static const VMStateDescription vmstate_virtio_mem_device_early;
/*
* We only had legacy x86 guests that did not support
* VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
@ -202,6 +204,30 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
return ret;
}
static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_bit, last_bit;
uint64_t offset, size;
int ret = 0;
first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
while (first_bit < vmem->bitmap_size) {
offset = first_bit * vmem->block_size;
last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
first_bit + 1) - 1;
size = (last_bit - first_bit + 1) * vmem->block_size;
ret = cb(vmem, arg, offset, size);
if (ret) {
break;
}
first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
last_bit + 2);
}
return ret;
}
/*
* Adjust the memory section to cover the intersection with the given range.
*
@ -772,6 +798,12 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
error_setg(errp, "'%s' property specifies an unsupported memdev",
VIRTIO_MEM_MEMDEV_PROP);
return;
} else if (vmem->memdev->prealloc) {
error_setg(errp, "'%s' property specifies a memdev with preallocation"
" enabled: %s. Instead, specify 'prealloc=on' for the"
" virtio-mem device. ", VIRTIO_MEM_MEMDEV_PROP,
object_get_canonical_path_component(OBJECT(vmem->memdev)));
return;
}
if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
@ -872,6 +904,10 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
if (vmem->early_migration) {
vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY,
&vmstate_virtio_mem_device_early, vmem);
}
qemu_register_reset(virtio_mem_system_reset, vmem);
/*
@ -893,6 +929,10 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
*/
memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
qemu_unregister_reset(virtio_mem_system_reset, vmem);
if (vmem->early_migration) {
vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
vmem);
}
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
host_memory_backend_set_mapped(vmem->memdev, false);
virtio_del_queue(vdev, 0);
@ -922,6 +962,10 @@ static int virtio_mem_post_load(void *opaque, int version_id)
RamDiscardListener *rdl;
int ret;
if (vmem->prealloc && !vmem->early_migration) {
warn_report("Proper preallocation with migration requires a newer QEMU machine");
}
/*
* We started out with all memory discarded and our memory region is mapped
* into an address space. Replay, now that we updated the bitmap.
@ -941,6 +985,64 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return virtio_mem_restore_unplugged(vmem);
}
static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
if (local_err) {
error_report_err(local_err);
return -ENOMEM;
}
return 0;
}
static int virtio_mem_post_load_early(void *opaque, int version_id)
{
VirtIOMEM *vmem = VIRTIO_MEM(opaque);
RAMBlock *rb = vmem->memdev->mr.ram_block;
int ret;
if (!vmem->prealloc) {
return 0;
}
/*
* We restored the bitmap and verified that the basic properties
* match on source and destination, so we can go ahead and preallocate
* memory for all plugged memory blocks, before actual RAM migration starts
* touching this memory.
*/
ret = virtio_mem_for_each_plugged_range(vmem, NULL,
virtio_mem_prealloc_range_cb);
if (ret) {
return ret;
}
/*
* This is tricky: postcopy wants to start with a clean slate. On
* POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
* preallocated) RAM such that postcopy will work as expected later.
*
* However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
* RAM migration. So let's discard all memory again. This looks like an
* expensive NOP, but actually serves a purpose: we made sure that we
* were able to allocate all required backend memory once. We cannot
* guarantee that the backend memory we will free will remain free
* until we need it during postcopy, but at least we can catch the
* obvious setup issues this way.
*/
if (migration_incoming_postcopy_advised()) {
if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
return -EBUSY;
}
}
return 0;
}
typedef struct VirtIOMEMMigSanityChecks {
VirtIOMEM *parent;
uint64_t addr;
@ -1009,18 +1111,54 @@ static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
},
};
static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
{
const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
/* With early migration, these fields were already migrated. */
return !vmem->early_migration;
}
static const VMStateDescription vmstate_virtio_mem_device = {
.name = "virtio-mem-device",
.minimum_version_id = 1,
.version_id = 1,
.priority = MIG_PRI_VIRTIO_MEM,
.post_load = virtio_mem_post_load,
.fields = (VMStateField[]) {
VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
0, bitmap_size),
VMSTATE_END_OF_LIST()
},
};
/*
* Transfer properties that are immutable while migration is active early,
* such that we have this information around before migrating any RAM
* content.
*
* Note that virtio_mem_is_busy() makes sure these properties can no longer
* change on the migration source until migration completed.
*
* With QEMU compat machines, we transmit these properties later, via
* vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
*/
static const VMStateDescription vmstate_virtio_mem_device_early = {
.name = "virtio-mem-device-early",
.minimum_version_id = 1,
.version_id = 1,
.early_setup = true,
.post_load = virtio_mem_post_load_early,
.fields = (VMStateField[]) {
VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64(size, VirtIOMEM),
VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
VMSTATE_END_OF_LIST()
},
@ -1205,6 +1343,8 @@ static Property virtio_mem_properties[] = {
DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
unplugged_inaccessible, ON_OFF_AUTO_AUTO),
#endif
DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
early_migration, true),
DEFINE_PROP_END_OF_LIST(),
};

View File

@ -31,6 +31,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,
#define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size"
#define VIRTIO_MEM_ADDR_PROP "memaddr"
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
struct VirtIOMEM {
@ -74,6 +75,13 @@ struct VirtIOMEM {
/* whether to prealloc memory when plugging new blocks */
bool prealloc;
/*
* Whether we migrate properties that are immutable while migration is
* active early, before state of other devices and especially, before
* migrating any RAM content.
*/
bool early_migration;
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;

View File

@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
typedef enum QIOChannelFeature QIOChannelFeature;
enum QIOChannelFeature {
@ -41,6 +43,7 @@ enum QIOChannelFeature {
QIO_CHANNEL_FEATURE_SHUTDOWN,
QIO_CHANNEL_FEATURE_LISTEN,
QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
};
@ -114,6 +117,7 @@ struct QIOChannelClass {
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp);
int (*io_close)(QIOChannel *ioc,
Error **errp);
@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
* @niov: the length of the @iov array
* @fds: pointer to an array that will received file handles
* @nfds: pointer filled with number of elements in @fds on return
* @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
* @errp: pointer to a NULL-initialized error object
*
* Read data from the IO channel, storing it in the
@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp);
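As a hedged illustration of the new read flag (not code from this series; the four-byte buffer and surrounding error handling are arbitrary), a caller could peek at the first bytes queued on a socket channel without consuming them, which is what the migration code below does to identify channel types:

uint32_t magic = 0;
struct iovec iov = { .iov_base = &magic, .iov_len = sizeof(magic) };
Error *local_err = NULL;

/* Only channels that advertise the feature accept the peek flag. */
if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
    ssize_t len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
                                         QIO_CHANNEL_READ_FLAG_MSG_PEEK,
                                         &local_err);
    if (len == sizeof(magic)) {
        /* magic now holds the first four bytes; they are still queued in
         * the channel, so a later ordinary read returns them again. */
    }
}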

View File

@ -67,8 +67,10 @@ bool migration_has_failed(MigrationState *);
/* ...and after the device transmission */
bool migration_in_postcopy_after_devices(MigrationState *);
void migration_global_dump(Monitor *mon);
/* True if incomming migration entered POSTCOPY_INCOMING_DISCARD */
/* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */
bool migration_in_incoming_postcopy(void);
/* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */
bool migration_incoming_postcopy_advised(void);
/* True if background snapshot is active */
bool migration_in_bg_snapshot(void);

View File

@ -46,11 +46,6 @@ typedef struct SaveVMHandlers {
/* This runs outside the iothread lock! */
int (*save_setup)(QEMUFile *f, void *opaque);
void (*save_live_pending)(QEMUFile *f, void *opaque,
uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
/* Note for save_live_pending:
* - res_precopy_only is for data which must be migrated in precopy phase
* or in stopped state, in other words - before target vm start
@ -61,8 +56,16 @@ typedef struct SaveVMHandlers {
* Sum of res_postcopy_only, res_compatible and res_postcopy_only is the
* whole amount of pending data.
*/
/* This estimates the remaining data to transfer */
void (*state_pending_estimate)(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
/* This calculates the exact remaining data to transfer */
void (*state_pending_exact)(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
LoadStateHandler *load_state;
int (*load_setup)(QEMUFile *f, void *opaque);
int (*load_cleanup)(void *opaque);

View File

@ -147,6 +147,9 @@ enum VMStateFlags {
* VMStateField.struct_version_id to tell which version of the
* structure we are referencing to use. */
VMS_VSTRUCT = 0x8000,
/* Marker for end of list */
VMS_END = 0x10000
};
typedef enum {
@ -178,7 +181,21 @@ struct VMStateField {
struct VMStateDescription {
const char *name;
int unmigratable;
bool unmigratable;
/*
* This VMSD describes something that should be sent during setup phase
* of migration. It plays similar role as save_setup() for explicitly
* registered vmstate entries, so it can be seen as a way to describe
* save_setup() in VMSD structures.
*
* Note that for now, a SaveStateEntry cannot have a VMSD and
* operations (e.g., save_setup()) set at the same time. Consequently,
* save_setup() and a VMSD with early_setup set to true are mutually
* exclusive. For this reason, also early_setup VMSDs are migrated in a
* QEMU_VM_SECTION_FULL section, while save_setup() data is migrated in
* a QEMU_VM_SECTION_START section.
*/
bool early_setup;
int version_id;
int minimum_version_id;
MigrationPriority priority;
@ -705,8 +722,9 @@ extern const VMStateInfo vmstate_info_qlist;
* '_state' type
* That the pointer is right at the start of _tmp_type.
*/
#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \
#define VMSTATE_WITH_TMP_TEST(_state, _test, _tmp_type, _vmsd) { \
.name = "tmp", \
.field_exists = (_test), \
.size = sizeof(_tmp_type) + \
QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \
type_check_pointer(_state, \
@ -715,6 +733,9 @@ extern const VMStateInfo vmstate_info_qlist;
.info = &vmstate_info_tmp, \
}
#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) \
VMSTATE_WITH_TMP_TEST(_state, NULL, _tmp_type, _vmsd)
#define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \
.name = "unused", \
.field_exists = (_test), \
@ -738,8 +759,9 @@ extern const VMStateInfo vmstate_info_qlist;
/* _field_size should be a int32_t field in the _state struct giving the
* size of the bitmap _field in bits.
*/
#define VMSTATE_BITMAP(_field, _state, _version, _field_size) { \
#define VMSTATE_BITMAP_TEST(_field, _state, _test, _version, _field_size) { \
.name = (stringify(_field)), \
.field_exists = (_test), \
.version_id = (_version), \
.size_offset = vmstate_offset_value(_state, _field_size, int32_t),\
.info = &vmstate_info_bitmap, \
@ -747,6 +769,9 @@ extern const VMStateInfo vmstate_info_qlist;
.offset = offsetof(_state, _field), \
}
#define VMSTATE_BITMAP(_field, _state, _version, _field_size) \
VMSTATE_BITMAP_TEST(_field, _state, NULL, _version, _field_size)
/* For migrating a QTAILQ.
* Target QTAILQ needs be properly initialized.
* _type: type of QTAILQ element
@ -1161,7 +1186,9 @@ extern const VMStateInfo vmstate_info_qlist;
VMSTATE_UNUSED_BUFFER(_test, 0, _size)
#define VMSTATE_END_OF_LIST() \
{}
{ \
.flags = VMS_END, \
}
int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, int version_id);
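For context, a minimal sketch of a device-side VMStateDescription under the new scheme (the "demo-device" name and its single field are invented for illustration, not part of this series): VMSTATE_END_OF_LIST() now expands to a field whose flags are VMS_END, which the vmstate_check() pass added later in this series (run when qtest is enabled) uses to verify that every field list is properly terminated.

typedef struct DemoDevice {
    uint32_t level;
} DemoDevice;

static const VMStateDescription vmstate_demo_device = {
    .name = "demo-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(level, DemoDevice),
        VMSTATE_END_OF_LIST()     /* now carries .flags = VMS_END */
    },
};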

View File

@ -13,10 +13,20 @@
#ifndef USERFAULTFD_H
#define USERFAULTFD_H
#ifdef CONFIG_LINUX
#include "qemu/osdep.h"
#include "exec/hwaddr.h"
#include <linux/userfaultfd.h>
/**
* uffd_open(): Open an userfaultfd handle for current context.
*
* @flags: The flags we want to pass in when creating the handle.
*
* Returns: the uffd handle if >=0, or <0 if error happens.
*/
int uffd_open(int flags);
int uffd_query_features(uint64_t *features);
int uffd_create_fd(uint64_t features, bool non_blocking);
void uffd_close_fd(int uffd_fd);
@ -32,4 +42,6 @@ int uffd_wakeup(int uffd_fd, void *addr, uint64_t length);
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count);
bool uffd_poll_events(int uffd_fd, int tmo);
#endif /* CONFIG_LINUX */
#endif /* USERFAULTFD_H */

View File

@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);

View File

@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);

View File

@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);

View File

@ -60,6 +60,7 @@ qio_channel_null_readv(QIOChannel *ioc,
size_t niov,
int **fds G_GNUC_UNUSED,
size_t *nfds G_GNUC_UNUSED,
int flags,
Error **errp)
{
QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);

View File

@ -173,6 +173,9 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
}
#endif
qio_channel_set_feature(QIO_CHANNEL(ioc),
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
return 0;
}
@ -406,6 +409,9 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
}
#endif /* WIN32 */
qio_channel_set_feature(QIO_CHANNEL(cioc),
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd);
return cioc;
@ -496,6 +502,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@ -517,6 +524,10 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
}
if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
sflags |= MSG_PEEK;
}
retry:
ret = recvmsg(sioc->fd, &msg, sflags);
if (ret < 0) {
@ -624,11 +635,17 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
ssize_t done = 0;
ssize_t i;
int sflags = 0;
if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
sflags |= MSG_PEEK;
}
for (i = 0; i < niov; i++) {
ssize_t ret;
@ -636,7 +653,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
ret = recv(sioc->fd,
iov[i].iov_base,
iov[i].iov_len,
0);
sflags);
if (ret < 0) {
if (errno == EAGAIN) {
if (done) {

View File

@ -260,6 +260,7 @@ static ssize_t qio_channel_tls_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);

View File

@ -1081,6 +1081,7 @@ static ssize_t qio_channel_websock_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);

View File

@ -52,6 +52,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
@ -63,7 +64,14 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
return -1;
}
return klass->io_readv(ioc, iov, niov, fds, nfds, errp);
if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) &&
!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
error_setg_errno(errp, EINVAL,
"Channel does not support peek read");
return -1;
}
return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp);
}
@ -146,7 +154,7 @@ int qio_channel_readv_full_all_eof(QIOChannel *ioc,
while ((nlocal_iov > 0) || local_fds) {
ssize_t len;
len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
local_nfds, errp);
local_nfds, 0, errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(ioc, G_IO_IN);
@ -284,7 +292,7 @@ ssize_t qio_channel_readv(QIOChannel *ioc,
size_t niov,
Error **errp)
{
return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp);
return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp);
}
@ -303,7 +311,7 @@ ssize_t qio_channel_read(QIOChannel *ioc,
Error **errp)
{
struct iovec iov = { .iov_base = buf, .iov_len = buflen };
return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp);
return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp);
}

View File

@ -762,11 +762,10 @@ static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void dirty_bitmap_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
DBMSaveState *s = &((DBMState *)opaque)->save;
SaveBitmapState *dbms;
@ -784,7 +783,7 @@ static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
qemu_mutex_unlock_iothread();
trace_dirty_bitmap_save_pending(pending, max_size);
trace_dirty_bitmap_state_pending(pending);
*res_postcopy_only += pending;
}
@ -1253,7 +1252,8 @@ static SaveVMHandlers savevm_dirty_bitmap_handlers = {
.save_live_complete_postcopy = dirty_bitmap_save_complete,
.save_live_complete_precopy = dirty_bitmap_save_complete,
.has_postcopy = dirty_bitmap_has_postcopy,
.save_live_pending = dirty_bitmap_save_pending,
.state_pending_exact = dirty_bitmap_state_pending,
.state_pending_estimate = dirty_bitmap_state_pending,
.save_live_iterate = dirty_bitmap_save_iterate,
.is_active_iterate = dirty_bitmap_is_active_iterate,
.load_state = dirty_bitmap_load,

View File

@ -863,10 +863,10 @@ static int block_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void block_state_pending(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
/* Estimate pending number of bytes to send */
uint64_t pending;
@ -885,7 +885,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
pending = BLK_MIG_BLOCK_SIZE;
}
trace_migration_block_save_pending(pending);
trace_migration_block_state_pending(pending);
/* We don't do postcopy */
*res_precopy_only += pending;
}
@ -1020,7 +1020,8 @@ static SaveVMHandlers savevm_block_handlers = {
.save_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete_precopy = block_save_complete,
.save_live_pending = block_save_pending,
.state_pending_exact = block_state_pending,
.state_pending_estimate = block_state_pending,
.load_state = block_load,
.save_cleanup = block_migration_cleanup,
.is_active = block_is_active,

View File

@ -53,6 +53,7 @@ qio_channel_block_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);

View File

@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s,
migrate_fd_connect(s, error);
error_free(error);
}
/**
* @migration_channel_read_peek - Peek at migration channel, without
* actually removing it from channel buffer.
*
* @ioc: the channel object
* @buf: the memory region to read data into
* @buflen: the number of bytes to read in @buf
* @errp: pointer to a NULL-initialized error object
*
* Returns 0 if successful, returns -1 and sets @errp if fails.
*/
int migration_channel_read_peek(QIOChannel *ioc,
const char *buf,
const size_t buflen,
Error **errp)
{
ssize_t len = 0;
struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
while (true) {
len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
error_setg(errp,
"Failed to peek at channel");
return -1;
}
if (len == buflen) {
break;
}
/* 1ms sleep. */
if (qemu_in_coroutine()) {
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
} else {
g_usleep(1000);
}
}
return 0;
}

View File

@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error_in);
int migration_channel_read_peek(QIOChannel *ioc,
const char *buf,
const size_t buflen,
Error **errp);
#endif

View File

@ -714,8 +714,8 @@ void qmp_calc_dirty_rate(int64_t calc_time,
mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
}
if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
error_setg(errp, "either sample-pages or dirty-ring can be specified.");
if (has_sample_pages && mode != DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
error_setg(errp, "sample-pages is used only in page-sampling mode");
return;
}
@ -785,8 +785,10 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict)
DirtyRateStatus_str(info->status));
monitor_printf(mon, "Start Time: %"PRIi64" (ms)\n",
info->start_time);
monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
info->sample_pages);
if (info->mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
info->sample_pages);
}
monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
info->calc_time);
monitor_printf(mon, "Mode: %s\n",

View File

@ -26,6 +26,7 @@ softmmu_ss.add(files(
'savevm.c',
'socket.c',
'tls.c',
'threadinfo.c',
), gnutls)
softmmu_ss.add(when: rdma, if_true: files('rdma.c'))

View File

@ -31,6 +31,7 @@
#include "migration.h"
#include "savevm.h"
#include "qemu-file.h"
#include "channel.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
@ -57,6 +58,7 @@
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"
@ -664,10 +666,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
if (multifd_load_setup(errp) != 0) {
return false;
}
if (!mis->from_src_file) {
mis->from_src_file = f;
}
@ -734,31 +732,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
bool start_migration;
QEMUFile *f;
bool default_channel = true;
uint32_t channel_magic = 0;
int ret = 0;
if (!mis->from_src_file) {
/* The first connection (multifd may have multiple) */
if (migrate_use_multifd() && !migrate_postcopy_ram() &&
qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
/*
* With multiple channels, it is possible that we receive channels
* out of order on destination side, causing incorrect mapping of
* source channels on destination side. Check channel MAGIC to
* decide type of channel. Please note this is best effort, postcopy
* preempt channel does not send any magic number so avoid it for
* postcopy live migration. Also tls live migration already does
* tls handshake while initializing main channel so with tls this
* issue is not possible.
*/
ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
sizeof(channel_magic), &local_err);
if (ret != 0) {
error_propagate(errp, local_err);
return;
}
default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
} else {
default_channel = !mis->from_src_file;
}
if (multifd_load_setup(errp) != 0) {
error_setg(errp, "Failed to setup multifd channels");
return;
}
if (default_channel) {
f = qemu_file_new_input(ioc);
if (!migration_incoming_setup(f, errp)) {
return;
}
/*
* Common migration only needs one channel, so we can start
* right now. Some features need more than one channel, we wait.
*/
start_migration = !migration_needs_multiple_sockets();
} else {
/* Multiple connections */
assert(migration_needs_multiple_sockets());
if (migrate_use_multifd()) {
start_migration = multifd_recv_new_channel(ioc, &local_err);
multifd_recv_new_channel(ioc, &local_err);
} else {
assert(migrate_postcopy_preempt());
f = qemu_file_new_input(ioc);
start_migration = postcopy_preempt_new_channel(mis, f);
postcopy_preempt_new_channel(mis, f);
}
if (local_err) {
error_propagate(errp, local_err);
@ -766,7 +789,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
}
}
if (start_migration) {
if (migration_has_all_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@ -1051,20 +1074,30 @@ bool migration_is_running(int state)
}
}
static bool migrate_show_downtime(MigrationState *s)
{
return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
}
static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
info->has_status = true;
info->has_setup_time = true;
info->setup_time = s->setup_time;
if (s->state == MIGRATION_STATUS_COMPLETED) {
info->has_total_time = true;
info->total_time = s->total_time;
info->has_downtime = true;
info->downtime = s->downtime;
} else {
info->has_total_time = true;
info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
s->start_time;
}
if (migrate_show_downtime(s)) {
info->has_downtime = true;
info->downtime = s->downtime;
} else {
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
}
@ -1933,6 +1966,8 @@ static void migrate_fd_cleanup(MigrationState *s)
g_free(s->hostname);
s->hostname = NULL;
json_writer_free(s->vmdesc);
s->vmdesc = NULL;
qemu_savevm_state_cleanup();
@ -2124,6 +2159,13 @@ bool migration_in_incoming_postcopy(void)
return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
}
bool migration_incoming_postcopy_advised(void)
{
PostcopyState ps = postcopy_state_get();
return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}
bool migration_in_bg_snapshot(void)
{
MigrationState *s = migrate_get_current();
@ -3778,33 +3820,39 @@ typedef enum {
*/
static MigIterateState migration_iteration_run(MigrationState *s)
{
uint64_t pending_size, pend_pre, pend_compat, pend_post;
uint64_t pend_pre, pend_compat, pend_post;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
&pend_compat, &pend_post);
pending_size = pend_pre + pend_compat + pend_post;
qemu_savevm_state_pending_estimate(&pend_pre, &pend_compat, &pend_post);
uint64_t pending_size = pend_pre + pend_compat + pend_post;
trace_migrate_pending(pending_size, s->threshold_size,
pend_pre, pend_compat, pend_post);
trace_migrate_pending_estimate(pending_size,
pend_pre, pend_compat, pend_post);
if (pending_size && pending_size >= s->threshold_size) {
/* Still a significant amount to transfer */
if (!in_postcopy && pend_pre <= s->threshold_size &&
qatomic_read(&s->start_postcopy)) {
if (postcopy_start(s)) {
error_report("%s: postcopy failed to start", __func__);
}
return MIG_ITERATE_SKIP;
}
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
} else {
if (pend_pre + pend_compat <= s->threshold_size) {
qemu_savevm_state_pending_exact(&pend_pre, &pend_compat, &pend_post);
pending_size = pend_pre + pend_compat + pend_post;
trace_migrate_pending_exact(pending_size,
pend_pre, pend_compat, pend_post);
}
if (!pending_size || pending_size < s->threshold_size) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
/* Still a significant amount to transfer */
if (!in_postcopy && pend_pre <= s->threshold_size &&
qatomic_read(&s->start_postcopy)) {
if (postcopy_start(s)) {
error_report("%s: postcopy failed to start", __func__);
}
return MIG_ITERATE_SKIP;
}
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
return MIG_ITERATE_RESUME;
}
@ -3981,10 +4029,13 @@ static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
MigrationThread *thread = NULL;
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
MigThrError thr_error;
bool urgent = false;
thread = MigrationThreadAdd("live_migration", qemu_get_thread_id());
rcu_register_thread();
object_ref(OBJECT(s));
@ -4061,6 +4112,7 @@ static void *migration_thread(void *opaque)
migration_iteration_finish(s);
object_unref(OBJECT(s));
rcu_unregister_thread();
MigrationThreadDel(thread);
return NULL;
}

View File

@ -17,6 +17,7 @@
#include "exec/cpu-common.h"
#include "hw/qdev-core.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
@ -366,6 +367,9 @@ struct MigrationState {
* This save hostname when out-going migration starts
*/
char *hostname;
/* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
JSONWriter *vmdesc;
};
void migrate_set_state(int *state, int old_state, int new_state);

View File

@ -24,6 +24,7 @@
#include "qemu-file.h"
#include "trace.h"
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
#include "io/channel-socket.h"
@ -442,6 +443,7 @@ static int multifd_send_pages(QEMUFile *f)
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
MultiFDPages_t *pages = multifd_send_state->pages;
bool changed = false;
if (!pages->block) {
pages->block = block;
@ -454,14 +456,16 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
if (pages->num < pages->allocated) {
return 1;
}
} else {
changed = true;
}
if (multifd_send_pages(f) < 0) {
return -1;
}
if (pages->block != block) {
return multifd_queue_page(f, block, offset);
if (changed) {
return multifd_queue_page(f, block, offset);
}
return 1;
@ -627,16 +631,16 @@ int multifd_send_sync_main(QEMUFile *f)
stat64_add(&ram_atomic_counters.transferred, p->packet_len);
qemu_mutex_unlock(&p->mutex);
qemu_sem_post(&p->sem);
if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
return -1;
}
}
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
trace_multifd_send_sync_main_wait(p->id);
qemu_sem_wait(&p->sem_sync);
if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
return -1;
}
}
trace_multifd_send_sync_main(multifd_send_state->packet_num);
@ -646,10 +650,13 @@ int multifd_send_sync_main(QEMUFile *f)
static void *multifd_send_thread(void *opaque)
{
MultiFDSendParams *p = opaque;
MigrationThread *thread = NULL;
Error *local_err = NULL;
int ret = 0;
bool use_zero_copy_send = migrate_use_zero_copy_send();
thread = MigrationThreadAdd(p->name, qemu_get_thread_id());
trace_multifd_send_thread_start(p->id);
rcu_register_thread();
@ -759,6 +766,7 @@ out:
qemu_mutex_unlock(&p->mutex);
rcu_unregister_thread();
MigrationThreadDel(thread);
trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages);
return NULL;
@ -1164,9 +1172,14 @@ int multifd_load_setup(Error **errp)
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
uint8_t i;
if (!migrate_use_multifd()) {
/*
* Return successfully if multiFD recv state is already initialised
* or multiFD is not enabled.
*/
if (multifd_recv_state || !migrate_use_multifd()) {
return 0;
}
if (!migrate_multi_channels_is_allowed()) {
error_setg(errp, "multifd is not supported by current protocol");
return -1;
@ -1227,11 +1240,9 @@ bool multifd_recv_all_channels_created(void)
/*
* Try to receive all multifd channels to get ready for the migration.
* - Return true and do not set @errp when correctly receiving all channels;
* - Return false and do not set @errp when correctly receiving the current one;
* - Return false and set @errp when failing to receive the current channel.
* Sets @errp when failing to receive the current channel.
*/
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
MultiFDRecvParams *p;
Error *local_err = NULL;
@ -1244,7 +1255,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
"failed to receive packet"
" via multifd channel %d: ",
qatomic_read(&multifd_recv_state->count));
return false;
return;
}
trace_multifd_recv_new_channel(id);
@ -1254,7 +1265,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
id);
multifd_recv_terminate_threads(local_err);
error_propagate(errp, local_err);
return false;
return;
}
p->c = ioc;
object_ref(OBJECT(ioc));
@ -1265,6 +1276,4 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
QEMU_THREAD_JOINABLE);
qatomic_inc(&multifd_recv_state->count);
return qatomic_read(&multifd_recv_state->count) ==
migrate_multifd_channels();
}

migration/multifd.c.orig (new file, 1274 lines added): diff suppressed because it is too large

View File

@ -18,7 +18,7 @@ void multifd_save_cleanup(void);
int multifd_load_setup(Error **errp);
int multifd_load_cleanup(Error **errp);
bool multifd_recv_all_channels_created(void);
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_sync_main(void);
int multifd_send_sync_main(QEMUFile *f);
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);

View File

@ -37,6 +37,7 @@
#include "qemu-file.h"
#include "yank_functions.h"
#include "tls.h"
#include "qemu/userfaultfd.h"
/* Arbitrary limit on size of each discard command,
* keeps them around ~200 bytes
@ -226,11 +227,9 @@ static bool receive_ufd_features(uint64_t *features)
int ufd;
bool ret = true;
/* if we are here __NR_userfaultfd should exists */
ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
strerror(errno));
error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
return false;
}
@ -375,7 +374,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
goto out;
}
ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
error_report("%s: userfaultfd not available: %s", __func__,
strerror(errno));
@ -1160,7 +1159,7 @@ static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
/* Open the fd for the kernel to give us userfaults */
mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
if (mis->userfault_fd == -1) {
error_report("%s: Failed to open userfault fd: %s", __func__,
strerror(errno));
@ -1539,7 +1538,7 @@ void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
}
}
bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
{
/*
* The new loading channel has its own threads, so it needs to be
@ -1548,9 +1547,6 @@ bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
qemu_file_set_blocking(file, true);
mis->postcopy_qemufile_dst = file;
trace_postcopy_preempt_new_channel();
/* Start the migration immediately */
return true;
}
/*

View File

@ -190,7 +190,7 @@ enum PostcopyChannels {
RAM_CHANNEL_MAX,
};
bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
int postcopy_preempt_setup(MigrationState *s, Error **errp);
int postcopy_preempt_wait_channel(MigrationState *s);

View File

@ -1774,13 +1774,15 @@ out:
static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
ram_addr_t size)
{
const ram_addr_t end = offset + size;
/*
* We read one byte of each page; this will preallocate page tables if
* required and populate the shared zeropage on MAP_PRIVATE anonymous memory
* where no page was populated yet. This might require adaption when
* supporting other mappings, like shmem.
*/
for (; offset < size; offset += block->page_size) {
for (; offset < end; offset += block->page_size) {
char tmp = *((char *)block->host + offset);
/* Don't optimize the read out */
@ -1863,6 +1865,39 @@ void ram_write_tracking_prepare(void)
}
}
static inline int uffd_protect_section(MemoryRegionSection *section,
void *opaque)
{
const hwaddr size = int128_get64(section->size);
const hwaddr offset = section->offset_within_region;
RAMBlock *rb = section->mr->ram_block;
int uffd_fd = (uintptr_t)opaque;
return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
false);
}
static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
assert(rb->flags & RAM_UF_WRITEPROTECT);
/* See ram_block_populate_read() */
if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
MemoryRegionSection section = {
.mr = rb->mr,
.offset_within_region = 0,
.size = rb->mr->size,
};
return ram_discard_manager_replay_populated(rdm, &section,
uffd_protect_section,
(void *)(uintptr_t)uffd_fd);
}
return uffd_change_protection(uffd_fd, rb->host,
rb->used_length, true, false);
}
/*
* ram_write_tracking_start: start UFFD-WP memory tracking
*
@ -1894,14 +1929,14 @@ int ram_write_tracking_start(void)
block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
goto fail;
}
/* Apply UFFD write protection to the block memory range */
if (uffd_change_protection(rs->uffdio_fd, block->host,
block->max_length, true, false)) {
goto fail;
}
block->flags |= RAM_UF_WRITEPROTECT;
memory_region_ref(block->mr);
/* Apply UFFD write protection to the block memory range */
if (ram_block_uffd_protect(block, uffd_fd)) {
goto fail;
}
trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
block->host, block->max_length);
}
@ -1915,12 +1950,6 @@ fail:
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
/*
* In case some memory block failed to be write-protected
* remove protection and unregister all succeeded RAM blocks
*/
uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
/* Cleanup flags and remove reference */
block->flags &= ~RAM_UF_WRITEPROTECT;
@ -1946,9 +1975,6 @@ void ram_write_tracking_stop(void)
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
/* Remove protection and unregister all affected RAM blocks */
uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
@ -2319,8 +2345,25 @@ static void pss_host_page_prepare(PageSearchStatus *pss)
size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
pss->host_page_sending = true;
pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
if (guest_pfns <= 1) {
/*
* This covers both when guest psize == host psize, or when guest
* has larger psize than the host (guest_pfns==0).
*
* For the latter, we always send one whole guest page per
* iteration of the host page (example: an Alpha VM on x86 host
* will have guest psize 8K while host psize 4K).
*/
pss->host_page_start = pss->page;
pss->host_page_end = pss->page + 1;
} else {
/*
* The host page spans over multiple guest pages, we send them
* within the same host page iteration.
*/
pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
}
}
/*
@ -3392,19 +3435,35 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return 0;
}
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
static void ram_state_pending_estimate(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
uint64_t remaining_size;
remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
if (!migration_in_postcopy() &&
remaining_size < max_size) {
if (migrate_postcopy_ram()) {
/* We can do postcopy, and all the data is postcopiable */
*res_postcopy_only += remaining_size;
} else {
*res_precopy_only += remaining_size;
}
}
static void ram_state_pending_exact(void *opaque,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
if (!migration_in_postcopy()) {
qemu_mutex_lock_iothread();
WITH_RCU_READ_LOCK_GUARD() {
migration_bitmap_sync_precopy(rs);
@ -4091,12 +4150,6 @@ int ram_load_postcopy(QEMUFile *f, int channel)
return ret;
}
static bool postcopy_is_advised(void)
{
PostcopyState ps = postcopy_state_get();
return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}
static bool postcopy_is_running(void)
{
PostcopyState ps = postcopy_state_get();
@ -4167,7 +4220,7 @@ static int ram_load_precopy(QEMUFile *f)
MigrationIncomingState *mis = migration_incoming_get_current();
int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
/* ADVISE is earlier, it shows the source has the postcopy capability on */
bool postcopy_advised = postcopy_is_advised();
bool postcopy_advised = migration_incoming_postcopy_advised();
if (!migrate_use_compression()) {
invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
}
@ -4560,7 +4613,8 @@ static SaveVMHandlers savevm_ram_handlers = {
.save_live_complete_postcopy = ram_save_complete,
.save_live_complete_precopy = ram_save_complete,
.has_postcopy = ram_has_postcopy,
.save_live_pending = ram_save_pending,
.state_pending_exact = ram_state_pending_exact,
.state_pending_estimate = ram_state_pending_estimate,
.load_state = ram_load,
.save_cleanup = ram_save_cleanup,
.load_setup = ram_load_setup,

View File

@ -2785,7 +2785,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
rdma = qatomic_rcu_read(&rioc->rdmaout);
if (!rdma) {
return -EIO;
error_setg(errp, "RDMA control channel output is not set");
return -1;
}
CHECK_ERROR_STATE();
@ -2797,7 +2798,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
ret = qemu_rdma_write_flush(f, rdma);
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
return -1;
}
for (i = 0; i < niov; i++) {
@ -2816,7 +2818,8 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
return -1;
}
data += len;
@ -2854,6 +2857,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
int flags,
Error **errp)
{
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
@ -2867,7 +2871,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
rdma = qatomic_rcu_read(&rioc->rdmain);
if (!rdma) {
return -EIO;
error_setg(errp, "RDMA control channel input is not set");
return -1;
}
CHECK_ERROR_STATE();
@ -2903,7 +2908,8 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
if (ret < 0) {
rdma->error_state = ret;
return ret;
error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
return -1;
}
/*

View File

@ -42,7 +42,6 @@
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
@ -67,6 +66,7 @@
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"
const unsigned int postcopy_ram_discard_version;
@ -586,6 +586,7 @@ static void dump_vmstate_vmsd(FILE *out_file,
field++;
first = false;
}
assert(field->flags == VMS_END);
fprintf(out_file, "\n%*s]", indent, "");
}
if (vmsd->subsections != NULL) {
@@ -804,6 +805,42 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
}
}
/*
* Perform some basic checks on vmsd's at registration
* time.
*/
static void vmstate_check(const VMStateDescription *vmsd)
{
const VMStateField *field = vmsd->fields;
const VMStateDescription **subsection = vmsd->subsections;
if (field) {
while (field->name) {
if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
/* Recurse to sub structures */
vmstate_check(field->vmsd);
}
/* Carry on */
field++;
}
/* Check for the end of field list canary */
if (field->flags != VMS_END) {
error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
g_assert_not_reached();
}
}
while (subsection && *subsection) {
/*
* The name of a subsection should start with the name of the
* current object.
*/
assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
vmstate_check(*subsection);
subsection++;
}
}
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
void *opaque, int alias_id,
@@ -849,6 +886,11 @@ int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
} else {
se->instance_id = instance_id;
}
/* Perform a recursive sanity check during the test runs */
if (qtest_enabled()) {
vmstate_check(vmsd);
}
assert(!se->compat || se->instance_id == 0);
savevm_state_handler_insert(se);
return 0;
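The new vmstate_check() walks every registered VMStateDescription (recursing into VMS_STRUCT fields) and insists that the field list is terminated by a VMS_END entry. That terminator is what VMSTATE_END_OF_LIST() expands to, so a well-formed description looks like the hedged sketch below (MyDevState and its field are invented for illustration):

    typedef struct MyDevState { uint32_t counter; } MyDevState;   /* hypothetical */

    static const VMStateDescription vmstate_mydev = {
        .name = "mydev",
        .version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(counter, MyDevState),
            VMSTATE_END_OF_LIST()   /* omitting this is what the check catches */
        },
    };

Because the check is guarded by qtest_enabled(), a broken description aborts during the test runs rather than in production.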
@@ -898,17 +940,6 @@ static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
}
}
- static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
- JSONWriter *vmdesc)
- {
- trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
- if (!se->vmsd) {
- vmstate_save_old_style(f, se, vmdesc);
- return 0;
- }
- return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
- }
/*
* Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
*/
@@ -942,6 +973,43 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
}
}
static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
{
int ret;
if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
return 0;
}
if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
trace_savevm_section_skip(se->idstr, se->section_id);
return 0;
}
trace_savevm_section_start(se->idstr, se->section_id);
save_section_header(f, se, QEMU_VM_SECTION_FULL);
if (vmdesc) {
json_writer_start_object(vmdesc, NULL);
json_writer_str(vmdesc, "name", se->idstr);
json_writer_int64(vmdesc, "instance_id", se->instance_id);
}
trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
if (!se->vmsd) {
vmstate_save_old_style(f, se, vmdesc);
} else {
ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
if (ret) {
return ret;
}
}
trace_savevm_section_end(se->idstr, se->section_id, 0);
save_section_footer(f, se);
if (vmdesc) {
json_writer_end_object(vmdesc);
}
return 0;
}
/**
* qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
* command and associated data.
@@ -1164,12 +1232,27 @@ bool qemu_savevm_state_guest_unplug_pending(void)
void qemu_savevm_state_setup(QEMUFile *f)
{
MigrationState *ms = migrate_get_current();
SaveStateEntry *se;
Error *local_err = NULL;
int ret;
ms->vmdesc = json_writer_new(false);
json_writer_start_object(ms->vmdesc, NULL);
json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
json_writer_start_array(ms->vmdesc, "devices");
trace_savevm_state_setup();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->vmsd && se->vmsd->early_setup) {
ret = vmstate_save(f, se, ms->vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
break;
}
continue;
}
if (!se->ops || !se->ops->save_setup) {
continue;
}
@@ -1365,41 +1448,23 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
{
- g_autoptr(JSONWriter) vmdesc = NULL;
+ MigrationState *ms = migrate_get_current();
+ JSONWriter *vmdesc = ms->vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;
- vmdesc = json_writer_new(false);
- json_writer_start_object(vmdesc, NULL);
- json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
- json_writer_start_array(vmdesc, "devices");
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- trace_savevm_section_skip(se->idstr, se->section_id);
+ if (se->vmsd && se->vmsd->early_setup) {
+ /* Already saved during qemu_savevm_state_setup(). */
continue;
}
- trace_savevm_section_start(se->idstr, se->section_id);
- json_writer_start_object(vmdesc, NULL);
- json_writer_str(vmdesc, "name", se->idstr);
- json_writer_int64(vmdesc, "instance_id", se->instance_id);
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
ret = vmstate_save(f, se, vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
return ret;
}
- trace_savevm_section_end(se->idstr, se->section_id, 0);
- save_section_footer(f, se);
- json_writer_end_object(vmdesc);
}
if (inactivate_disks) {
@@ -1428,6 +1493,10 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
}
/* Free it now to detect any inconsistencies. */
json_writer_free(vmdesc);
ms->vmdesc = NULL;
return 0;
}
@@ -1472,10 +1541,9 @@ flush:
* the result is split into the amount for units that can and
* for units that can't do postcopy.
*/
- void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+ void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
SaveStateEntry *se;
@@ -1483,9 +1551,8 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
*res_compatible = 0;
*res_postcopy_only = 0;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_pending) {
+ if (!se->ops || !se->ops->state_pending_exact) {
continue;
}
if (se->ops->is_active) {
@@ -1493,9 +1560,34 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
continue;
}
}
- se->ops->save_live_pending(f, se->opaque, threshold_size,
- res_precopy_only, res_compatible,
- res_postcopy_only);
+ se->ops->state_pending_exact(se->opaque,
+ res_precopy_only, res_compatible,
+ res_postcopy_only);
}
}
void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
{
SaveStateEntry *se;
*res_precopy_only = 0;
*res_compatible = 0;
*res_postcopy_only = 0;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->state_pending_estimate) {
continue;
}
if (se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
}
}
se->ops->state_pending_estimate(se->opaque,
res_precopy_only, res_compatible,
res_postcopy_only);
}
}
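On the calling side, the intent of the split is that the cheap estimate is polled every iteration and the expensive exact sync is only paid for when migration looks close to converging. A sketch of that pattern (threshold_size here stands for the remaining-data budget; the exact control flow in migration.c may differ):

    uint64_t pre, compat, post, pending;

    qemu_savevm_state_pending_estimate(&pre, &compat, &post);
    pending = pre + compat + post;

    if (pending <= threshold_size) {
        /* The cheap estimate says we might be done: pay for the exact answer. */
        qemu_savevm_state_pending_exact(&pre, &compat, &post);
        pending = pre + compat + post;
    }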
@@ -1595,21 +1687,10 @@ int qemu_save_device_state(QEMUFile *f)
if (se->is_ram) {
continue;
}
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- continue;
- }
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
ret = vmstate_save(f, se, NULL);
if (ret) {
return ret;
}
- save_section_footer(f, se);
}
qemu_put_byte(f, QEMU_VM_EOF);


@@ -40,10 +40,12 @@ void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks);
- void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only);
+ void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
+ void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
void qemu_savevm_send_open_return_path(QEMUFile *f);
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);

migration/threadinfo.c (new file)

@@ -0,0 +1,51 @@
/*
* Migration Threads info
*
* Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
*
* Authors:
* Jiang Jiacheng <jiangjiacheng@huawei.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "threadinfo.h"
static QLIST_HEAD(, MigrationThread) migration_threads;
MigrationThread *MigrationThreadAdd(const char *name, int thread_id)
{
MigrationThread *thread = g_new0(MigrationThread, 1);
thread->name = name;
thread->thread_id = thread_id;
QLIST_INSERT_HEAD(&migration_threads, thread, node);
return thread;
}
void MigrationThreadDel(MigrationThread *thread)
{
if (thread) {
QLIST_REMOVE(thread, node);
g_free(thread);
}
}
MigrationThreadInfoList *qmp_query_migrationthreads(Error **errp)
{
MigrationThreadInfoList *head = NULL;
MigrationThreadInfoList **tail = &head;
MigrationThread *thread = NULL;
QLIST_FOREACH(thread, &migration_threads, node) {
MigrationThreadInfo *info = g_new0(MigrationThreadInfo, 1);
info->name = g_strdup(thread->name);
info->thread_id = thread->thread_id;
QAPI_LIST_APPEND(tail, info);
}
return head;
}
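The new helpers are meant to be called from the migration worker threads themselves so that query-migrationthreads can report them. A hedged usage sketch (thread name and surrounding function are illustrative):

    static void *my_migration_worker(void *opaque)
    {
        MigrationThread *self;

        self = MigrationThreadAdd("live_migration", qemu_get_thread_id());
        /* ... do the actual migration work ... */
        MigrationThreadDel(self);
        return NULL;
    }

Note that MigrationThreadAdd() stores the name pointer as-is rather than copying it, so callers should pass a string literal or another string that outlives the thread entry.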

migration/threadinfo.h (new file)

@@ -0,0 +1,28 @@
/*
* Migration Threads info
*
* Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
*
* Authors:
* Jiang Jiacheng <jiangjiacheng@huawei.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/queue.h"
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
typedef struct MigrationThread MigrationThread;
struct MigrationThread {
const char *name; /* the name of migration thread */
int thread_id; /* ID of the underlying host thread */
QLIST_ENTRY(MigrationThread) node;
};
MigrationThread *MigrationThreadAdd(const char *name, int thread_id);
void MigrationThreadDel(MigrationThread *info);


@@ -150,7 +150,8 @@ migrate_fd_cleanup(void) ""
migrate_fd_error(const char *error_desc) "error=%s"
migrate_fd_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
- migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+ migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+ migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64
migration_completion_file_err(void) ""
@@ -330,7 +331,7 @@ send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uin
dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
dirty_bitmap_save_complete_enter(void) ""
dirty_bitmap_save_complete_finish(void) ""
- dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
+ dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64
dirty_bitmap_load_complete(void) ""
dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
dirty_bitmap_load_bits_zeroes(void) ""
@@ -355,7 +356,7 @@ migration_block_save_device_dirty(int64_t sector) "Error reading sector %" PRId6
migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d"
migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d"
migration_block_save_complete(void) "Block migration completed"
- migration_block_save_pending(uint64_t pending) "Enter save live pending %" PRIu64
+ migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64
# page_cache.c
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64


@@ -154,6 +154,7 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
}
field++;
}
assert(field->flags == VMS_END);
ret = vmstate_subsection_load(f, vmsd, opaque);
if (ret != 0) {
return ret;
@@ -408,6 +409,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
}
field++;
}
assert(field->flags == VMS_END);
if (vmdesc) {
json_writer_end_array(vmdesc);


@@ -1958,6 +1958,35 @@
{ 'command': 'query-vcpu-dirty-limit',
'returns': [ 'DirtyLimitInfo' ] }
##
# @MigrationThreadInfo:
#
# Information about migrationthreads
#
# @name: the name of migration thread
#
# @thread-id: ID of the underlying host thread
#
# Since: 7.2
##
{ 'struct': 'MigrationThreadInfo',
'data': {'name': 'str',
'thread-id': 'int'} }
##
# @query-migrationthreads:
#
# Returns information of migration threads
#
# data: migration thread name
#
# returns: information about migration threads
#
# Since: 7.2
##
{ 'command': 'query-migrationthreads',
'returns': ['MigrationThreadInfo'] }
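A hedged example of driving the new command over QMP (the thread name and id below are purely illustrative):

    -> { "execute": "query-migrationthreads" }
    <- { "return": [ { "name": "live_migration", "thread-id": 12345 } ] }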
##
# @snapshot-save:
#


@@ -614,7 +614,7 @@ static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz,
iov.iov_base = buf;
iov.iov_len = sz;
n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1,
&fds, &nfds, errp);
&fds, &nfds, 0, errp);
if (n_read == QIO_CHANNEL_ERR_BLOCK) {
qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN);


@@ -61,14 +61,14 @@ static bool uffd_feature_thread_id;
#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <sys/ioctl.h>
- #include <linux/userfaultfd.h>
+ #include "qemu/userfaultfd.h"
static bool ufd_version_check(void)
{
struct uffdio_api api_struct;
uint64_t ioctl_mask;
- int ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ int ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
g_test_message("Skipping test: userfaultfd not available");


@@ -115,7 +115,7 @@ void *tpm_emu_ctrl_thread(void *data)
int *pfd = NULL;
size_t nfd = 0;
qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort);
qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort);
cmd = be32_to_cpu(cmd);
g_assert_cmpint(cmd, ==, CMD_SET_DATAFD);
g_assert_cmpint(nfd, ==, 1);


@@ -460,6 +460,7 @@ static void test_io_channel_unix_fd_pass(void)
G_N_ELEMENTS(iorecv),
&fdrecv,
&nfdrecv,
+ 0,
&error_abort);
g_assert(nfdrecv == G_N_ELEMENTS(fdsend));


@@ -19,6 +19,15 @@
#include <sys/syscall.h>
#include <sys/ioctl.h>
int uffd_open(int flags)
{
#if defined(__NR_userfaultfd)
return syscall(__NR_userfaultfd, flags);
#else
return -EINVAL;
#endif
}
/**
* uffd_query_features: query UFFD features
*
@@ -32,7 +41,7 @@ int uffd_query_features(uint64_t *features)
struct uffdio_api api_struct = { 0 };
int ret = -1;
- uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ uffd_fd = uffd_open(O_CLOEXEC);
if (uffd_fd < 0) {
trace_uffd_query_features_nosys(errno);
return -1;
@@ -69,7 +78,7 @@ int uffd_create_fd(uint64_t features, bool non_blocking)
uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
- uffd_fd = syscall(__NR_userfaultfd, flags);
+ uffd_fd = uffd_open(flags);
if (uffd_fd < 0) {
trace_uffd_create_fd_nosys(errno);
return -1;


@@ -116,7 +116,7 @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
* qio_channel_readv_full may have short reads, keeping calling it
* until getting VHOST_USER_HDR_SIZE or 0 bytes in total
*/
rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err);
if (rc < 0) {
if (rc == QIO_CHANNEL_ERR_BLOCK) {
assert(local_err == NULL);