fix vmstate-snapshots with iothread=1

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
master
Thomas Lamprecht 2020-05-27 14:40:46 +02:00
parent 63be960208
commit f063a8aadb
5 changed files with 416 additions and 0 deletions

@@ -0,0 +1,188 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:19 +0200
Subject: [PATCH] savevm-async: move more code to cleanup and rename to
finalize
process_savevm_cleanup is renamed to process_savevm_finalize to
accommodate more code that is not all cleanup related.
The benefit of this is that it allows us to call functions which need to
run in the main AIOContext directly. It doesn't majorly affect snapshot
performance, since the first instruction that is moved stops the VM,
so the downtime stays about the same.
The target bdrv is additionally moved to the IOHandler context before
process_savevm_co to make sure the coroutine can call functions that
require it to own the bdrv's context. process_savevm_finalize then moves
it back to the main context to run its part.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
savevm-async.c | 87 +++++++++++++++++++++++++++++---------------------
1 file changed, 51 insertions(+), 36 deletions(-)
diff --git a/savevm-async.c b/savevm-async.c
index c3fe741c38..2894c94233 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -50,7 +50,7 @@ static struct SnapshotState {
int saved_vm_running;
QEMUFile *file;
int64_t total_time;
- QEMUBH *cleanup_bh;
+ QEMUBH *finalize_bh;
Coroutine *co;
} snap_state;
@@ -196,12 +196,42 @@ static const QEMUFileOps block_file_ops = {
.close = block_state_close,
};
-static void process_savevm_cleanup(void *opaque)
+static void process_savevm_finalize(void *opaque)
{
int ret;
- qemu_bh_delete(snap_state.cleanup_bh);
- snap_state.cleanup_bh = NULL;
+ AioContext *iohandler_ctx = iohandler_get_aio_context();
+ MigrationState *ms = migrate_get_current();
+
+ qemu_bh_delete(snap_state.finalize_bh);
+ snap_state.finalize_bh = NULL;
snap_state.co = NULL;
+
+ /* We need to own the target bdrv's context for the following functions,
+ * so move it back. It can stay in the main context and live out its life
+ * there, since we're done with it after this method ends anyway.
+ */
+ aio_context_acquire(iohandler_ctx);
+ blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
+ aio_context_release(iohandler_ctx);
+
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ save_snapshot_error("vm_stop_force_state error %d", ret);
+ }
+
+ (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
+ ret = qemu_file_get_error(snap_state.file);
+ if (ret < 0) {
+ save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+ }
+
+ DPRINTF("state saving complete\n");
+
+ /* clear migration state */
+ migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+ ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
+ ms->to_dst_file = NULL;
+
qemu_savevm_state_cleanup();
ret = save_snapshot_cleanup();
@@ -219,16 +249,15 @@ static void process_savevm_cleanup(void *opaque)
}
}
-static void process_savevm_coro(void *opaque)
+static void coroutine_fn process_savevm_co(void *opaque)
{
int ret;
int64_t maxlen;
- MigrationState *ms = migrate_get_current();
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
save_snapshot_error("qemu_savevm_state_setup failed");
- goto out;
+ return;
}
while (snap_state.state == SAVE_STATE_ACTIVE) {
@@ -245,7 +274,7 @@ static void process_savevm_coro(void *opaque)
save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
break;
}
- DPRINTF("savevm inerate pending size %lu ret %d\n", pending_size, ret);
+ DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret);
} else {
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
ret = global_state_store();
@@ -253,40 +282,20 @@ static void process_savevm_coro(void *opaque)
save_snapshot_error("global_state_store error %d", ret);
break;
}
- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
- if (ret < 0) {
- save_snapshot_error("vm_stop_force_state error %d", ret);
- break;
- }
- DPRINTF("savevm inerate finished\n");
- /* upstream made the return value here inconsistent
- * (-1 instead of 'ret' in one case and 0 after flush which can
- * still set a file error...)
- */
- (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
- ret = qemu_file_get_error(snap_state.file);
- if (ret < 0) {
- save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
- break;
- }
- DPRINTF("save complete\n");
+
+ DPRINTF("savevm iterate complete\n");
break;
}
}
- qemu_bh_schedule(snap_state.cleanup_bh);
-
-out:
- /* set migration state accordingly and clear soon-to-be stale file */
- migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
- ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
- ms->to_dst_file = NULL;
+ qemu_bh_schedule(snap_state.finalize_bh);
}
void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
{
Error *local_err = NULL;
MigrationState *ms = migrate_get_current();
+ AioContext *iohandler_ctx = iohandler_get_aio_context();
int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH;
@@ -347,7 +356,7 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
/*
* qemu_savevm_* paths use migration code and expect a migration state.
- * State is cleared in process_savevm_thread, but has to be initialized
+ * State is cleared in process_savevm_co, but has to be initialized
* here (blocking main thread, from QMP) to avoid race conditions.
*/
migrate_init(ms);
@@ -358,13 +367,19 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
blk_op_block_all(snap_state.target, snap_state.blocker);
snap_state.state = SAVE_STATE_ACTIVE;
- snap_state.cleanup_bh = qemu_bh_new(process_savevm_cleanup, &snap_state);
- snap_state.co = qemu_coroutine_create(&process_savevm_coro, NULL);
+ snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
+ snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
qemu_mutex_unlock_iothread();
qemu_savevm_state_header(snap_state.file);
qemu_savevm_state_setup(snap_state.file);
qemu_mutex_lock_iothread();
- aio_co_schedule(iohandler_get_aio_context(), snap_state.co);
+
+ /* Async processing from here on out happens in iohandler context, so let
+ * the target bdrv have its home there.
+ */
+ blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
+
+ aio_co_schedule(iohandler_ctx, snap_state.co);
return;
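
To make the new control flow easier to follow, here is a minimal sketch of the hand-off pattern this patch establishes. It is not part of the patch: it assumes the usual QEMU headers, and the demo_* names are hypothetical. The QMP handler pins the target BlockBackend to the iohandler context and schedules the save coroutine there; the coroutine later schedules a bottom half that runs the finalize step back in the main loop.

    /*
     * Sketch only, not part of the patch; demo_* names are hypothetical.
     * Assumes the QEMU build tree for the includes below.
     */
    #include "qemu/osdep.h"
    #include "qemu/main-loop.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "sysemu/block-backend.h"

    static QEMUBH *demo_finalize_bh;

    static void demo_finalize(void *opaque)          /* runs in the main loop (BH) */
    {
        BlockBackend *target = opaque;

        qemu_bh_delete(demo_finalize_bh);
        demo_finalize_bh = NULL;

        /* Pull the target back into the main context before the blocking
         * completion work (vm_stop_force_state() and friends). */
        aio_context_acquire(iohandler_get_aio_context());
        blk_set_aio_context(target, qemu_get_aio_context(), NULL);
        aio_context_release(iohandler_get_aio_context());
    }

    static void coroutine_fn demo_save_co(void *opaque)   /* iohandler context */
    {
        /* ... the qemu_savevm_state_iterate() loop runs here, VM still live ... */
        qemu_bh_schedule(demo_finalize_bh);          /* finish up in the main loop */
    }

    static void demo_start(BlockBackend *target)     /* QMP handler, main loop */
    {
        AioContext *iohandler_ctx = iohandler_get_aio_context();

        demo_finalize_bh = qemu_bh_new(demo_finalize, target);

        /* Let the target live in the iohandler context so the coroutine owns
         * the bdrv's context while it writes out the state. */
        blk_set_aio_context(target, iohandler_ctx, NULL);
        aio_co_schedule(iohandler_ctx, qemu_coroutine_create(demo_save_co, NULL));
    }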

@@ -0,0 +1,86 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 27 May 2020 11:33:20 +0200
Subject: [PATCH] util/async: Add aio_co_reschedule_self()
From: Kevin Wolf <kwolf@redhat.com>
Add a function that can be used to move the currently running coroutine
to a different AioContext (and therefore potentially a different
thread).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio.h | 10 ++++++++++
util/async.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/include/block/aio.h b/include/block/aio.h
index 62ed954344..d5399c67d6 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -17,6 +17,7 @@
#ifdef CONFIG_LINUX_IO_URING
#include <liburing.h>
#endif
+#include "qemu/coroutine.h"
#include "qemu/queue.h"
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
*/
void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
/**
* aio_co_wake:
* @co: the coroutine
diff --git a/util/async.c b/util/async.c
index 3165a28f2f..4eba1e6f1b 100644
--- a/util/async.c
+++ b/util/async.c
@@ -558,6 +558,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co)
aio_context_unref(ctx);
}
+typedef struct AioCoRescheduleSelf {
+ Coroutine *co;
+ AioContext *new_ctx;
+} AioCoRescheduleSelf;
+
+static void aio_co_reschedule_self_bh(void *opaque)
+{
+ AioCoRescheduleSelf *data = opaque;
+ aio_co_schedule(data->new_ctx, data->co);
+}
+
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
+{
+ AioContext *old_ctx = qemu_get_current_aio_context();
+
+ if (old_ctx != new_ctx) {
+ AioCoRescheduleSelf data = {
+ .co = qemu_coroutine_self(),
+ .new_ctx = new_ctx,
+ };
+ /*
+ * We can't directly schedule the coroutine in the target context
+ * because this would be racy: The other thread could try to enter the
+ * coroutine before it has yielded in this one.
+ */
+ aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
+ qemu_coroutine_yield();
+ }
+}
+
void aio_co_wake(struct Coroutine *co)
{
AioContext *ctx;
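
A usage sketch for the new helper, assuming a coroutine that needs to act on a drive owned by an IOThread (the demo_* name is hypothetical; the following savevm-async patch applies exactly this pattern): hop into the drive's AioContext, do the work that must run there, then hop back.

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "block/block.h"

    static void coroutine_fn demo_flush_in_owner_ctx(BlockDriverState *bs)
    {
        AioContext *main_ctx = qemu_get_aio_context();
        AioContext *bs_ctx = bdrv_get_aio_context(bs);

        if (bs_ctx != main_ctx) {
            aio_co_reschedule_self(bs_ctx);    /* yield; resume in bs's IOThread */
            bdrv_flush(bs);                    /* now runs in the owning context */
            aio_co_reschedule_self(main_ctx);  /* hop back to the main loop */
        } else {
            bdrv_flush(bs);
        }
    }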

@@ -0,0 +1,58 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:21 +0200
Subject: [PATCH] savevm-async: flush IOThread-drives async before entering
blocking part
By flushing all drives where it's possible to do so before entering the
blocking part (where the VM is stopped), we can reduce the time spent in
said part for every disk that has an IOThread (other drives cannot be
flushed async anyway).
Suggested-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
savevm-async.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/savevm-async.c b/savevm-async.c
index 2894c94233..4ce83a0691 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -253,6 +253,8 @@ static void coroutine_fn process_savevm_co(void *opaque)
{
int ret;
int64_t maxlen;
+ BdrvNextIterator it;
+ BlockDriverState *bs = NULL;
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
@@ -288,6 +290,27 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ /* If a drive runs in an IOThread we can flush it async, and only
+ * need to sync-flush whatever IO happens between now and
+ * vm_stop_force_state. bdrv_next can only be called from main AioContext,
+ * so move there now and after every flush.
+ */
+ aio_co_reschedule_self(qemu_get_aio_context());
+ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
+ if (bs == blk_bs(snap_state.target)) {
+ continue;
+ }
+
+ AioContext *bs_ctx = bdrv_get_aio_context(bs);
+ if (bs_ctx != qemu_get_aio_context()) {
+ DPRINTF("savevm: async flushing drive %s\n", bs->filename);
+ aio_co_reschedule_self(bs_ctx);
+ bdrv_flush(bs);
+ aio_co_reschedule_self(qemu_get_aio_context());
+ }
+ }
+
qemu_bh_schedule(snap_state.finalize_bh);
}

@@ -0,0 +1,80 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:22 +0200
Subject: [PATCH] savevm-async: add debug timing prints
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
[ Thomas: guard variable declaration by DEBUG #ifdef ]
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
savevm-async.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/savevm-async.c b/savevm-async.c
index 4ce83a0691..0388cebbe9 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -202,6 +202,10 @@ static void process_savevm_finalize(void *opaque)
AioContext *iohandler_ctx = iohandler_get_aio_context();
MigrationState *ms = migrate_get_current();
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
qemu_bh_delete(snap_state.finalize_bh);
snap_state.finalize_bh = NULL;
snap_state.co = NULL;
@@ -226,6 +230,8 @@ static void process_savevm_finalize(void *opaque)
}
DPRINTF("state saving complete\n");
+ DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
/* clear migration state */
migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
@@ -247,6 +253,9 @@ static void process_savevm_finalize(void *opaque)
vm_start();
snap_state.saved_vm_running = false;
}
+
+ DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
}
static void coroutine_fn process_savevm_co(void *opaque)
@@ -256,6 +265,10 @@ static void coroutine_fn process_savevm_co(void *opaque)
BdrvNextIterator it;
BlockDriverState *bs = NULL;
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
save_snapshot_error("qemu_savevm_state_setup failed");
@@ -290,6 +303,12 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ DPRINTF("timing: process_savevm_co took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
/* If a drive runs in an IOThread we can flush it async, and only
* need to sync-flush whatever IO happens between now and
* vm_stop_force_state. bdrv_next can only be called from main AioContext,
@@ -311,6 +330,9 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ DPRINTF("timing: async flushing took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush);
+
qemu_bh_schedule(snap_state.finalize_bh);
}
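
The #ifdef guards around the start_time variables matter because DPRINTF compiles to a no-op when DEBUG_SAVEVM_STATE is not defined, so an unguarded variable would be unused and trip -Wunused-variable under -Werror. A sketch of the assumed guard pattern (the exact DPRINTF definition in savevm-async.c may differ):

    #include "qemu/osdep.h"

    /* #define DEBUG_SAVEVM_STATE */

    #ifdef DEBUG_SAVEVM_STATE
    #define DPRINTF(fmt, ...) \
        do { printf("savevm-async: " fmt, ## __VA_ARGS__); } while (0)
    #else
    #define DPRINTF(fmt, ...) \
        do { } while (0)     /* timing prints vanish, so must their variables */
    #endif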

@@ -42,3 +42,7 @@ pve/0041-PVE-Backup-avoid-use-QemuRecMutex-inside-coroutines.patch
pve/0042-PVE-Backup-use-QemuMutex-instead-of-QemuRecMutex.patch
pve/0043-move-savevm-async-back-into-a-coroutine.patch
pve/0044-add-optional-buffer-size-to-QEMUFile.patch
pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch
pve/0046-util-async-Add-aio_co_reschedule_self.patch
pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch
pve/0048-savevm-async-add-debug-timing-prints.patch