diff --git a/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch b/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch new file mode 100644 index 0000000..c7f2ccb --- /dev/null +++ b/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch @@ -0,0 +1,117 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Mon, 29 Apr 2024 14:43:58 +0200 +Subject: [PATCH] PVE backup: improve error when copy-before-write fails for + fleecing + +With fleecing, failure for copy-before-write does not fail the guest +write, but only sets the snapshot error that is associated to the +copy-before-write filter, making further requests to the snapshot +access fail with EACCES, which then also fails the job. But that error +code is not the root cause of why the backup failed, so bubble up the +original snapshot error instead. + +Reported-by: Friedrich Weber +Signed-off-by: Fiona Ebner +Tested-by: Friedrich Weber +--- + block/copy-before-write.c | 18 ++++++++++++------ + block/copy-before-write.h | 1 + + pve-backup.c | 9 +++++++++ + 3 files changed, 22 insertions(+), 6 deletions(-) + +diff --git a/block/copy-before-write.c b/block/copy-before-write.c +index bba58326d7..50cc4c7aae 100644 +--- a/block/copy-before-write.c ++++ b/block/copy-before-write.c +@@ -27,6 +27,7 @@ + #include "qapi/qmp/qjson.h" + + #include "sysemu/block-backend.h" ++#include "qemu/atomic.h" + #include "qemu/cutils.h" + #include "qapi/error.h" + #include "block/block_int.h" +@@ -74,7 +75,8 @@ typedef struct BDRVCopyBeforeWriteState { + * @snapshot_error is normally zero. But on first copy-before-write failure + * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes + * value of this error (<0). After that all in-flight and further +- * snapshot-API requests will fail with that error. ++ * snapshot-API requests will fail with that error. To be accessed with ++ * atomics. + */ + int snapshot_error; + } BDRVCopyBeforeWriteState; +@@ -114,7 +116,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, + return 0; + } + +- if (s->snapshot_error) { ++ if (qatomic_read(&s->snapshot_error)) { + return 0; + } + +@@ -138,9 +140,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, + WITH_QEMU_LOCK_GUARD(&s->lock) { + if (ret < 0) { + assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT); +- if (!s->snapshot_error) { +- s->snapshot_error = ret; +- } ++ qatomic_cmpxchg(&s->snapshot_error, 0, ret); + } else { + bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); + } +@@ -214,7 +214,7 @@ cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes, + + QEMU_LOCK_GUARD(&s->lock); + +- if (s->snapshot_error) { ++ if (qatomic_read(&s->snapshot_error)) { + g_free(req); + return NULL; + } +@@ -585,6 +585,12 @@ void bdrv_cbw_drop(BlockDriverState *bs) + bdrv_unref(bs); + } + ++int bdrv_cbw_snapshot_error(BlockDriverState *bs) ++{ ++ BDRVCopyBeforeWriteState *s = bs->opaque; ++ return qatomic_read(&s->snapshot_error); ++} ++ + static void cbw_init(void) + { + bdrv_register(&bdrv_cbw_filter); +diff --git a/block/copy-before-write.h b/block/copy-before-write.h +index dc6cafe7fa..a27d2d7d9f 100644 +--- a/block/copy-before-write.h ++++ b/block/copy-before-write.h +@@ -44,5 +44,6 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source, + BlockCopyState **bcs, + Error **errp); + void bdrv_cbw_drop(BlockDriverState *bs); ++int bdrv_cbw_snapshot_error(BlockDriverState *bs); + + #endif /* COPY_BEFORE_WRITE_H */ +diff --git a/pve-backup.c b/pve-backup.c +index 7cc1dd3724..07709aa350 100644 +--- a/pve-backup.c ++++ b/pve-backup.c +@@ -379,6 +379,15 @@ static void pvebackup_complete_cb(void *opaque, int ret) + di->fleecing.snapshot_access = NULL; + } + if (di->fleecing.cbw) { ++ /* ++ * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot ++ * error, making further requests to the snapshot fail with EACCES, which then also fail the ++ * job. But that code is not the root cause and just confusing, so update it. ++ */ ++ int snapshot_error = bdrv_cbw_snapshot_error(di->fleecing.cbw); ++ if (di->completed_ret == -EACCES && snapshot_error) { ++ di->completed_ret = snapshot_error; ++ } + bdrv_cbw_drop(di->fleecing.cbw); + di->fleecing.cbw = NULL; + } diff --git a/debian/patches/series b/debian/patches/series index 47e8307..b97881e 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -59,3 +59,4 @@ pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch pve/0050-PVE-backup-add-fleecing-option.patch +pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch