From 677d0d169f7c4209ec2fbd89a9e9e02aa6bd9219 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Mon, 15 Mar 2021 16:41:24 +0100 Subject: [PATCH] add alloc-track block driver patch See added patches for more info, overview: 0044: slightly increase PBS performance by reducing allocations 0045: slightly increase block-stream performance for Ceph 0046: don't crash with block-stream on RBD 0047: add alloc-track driver for live restore Signed-off-by: Stefan Reiter --- ...st-path-reads-without-allocation-if-.patch | 52 +++ ...PVE-block-stream-increase-chunk-size.patch | 23 ++ ...accept-NULL-qiov-in-bdrv_pad_request.patch | 42 ++ .../0047-block-add-alloc-track-driver.patch | 391 ++++++++++++++++++ debian/patches/series | 4 + 5 files changed, 512 insertions(+) create mode 100644 debian/patches/pve/0044-PVE-block-pbs-fast-path-reads-without-allocation-if-.patch create mode 100644 debian/patches/pve/0045-PVE-block-stream-increase-chunk-size.patch create mode 100644 debian/patches/pve/0046-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch create mode 100644 debian/patches/pve/0047-block-add-alloc-track-driver.patch diff --git a/debian/patches/pve/0044-PVE-block-pbs-fast-path-reads-without-allocation-if-.patch b/debian/patches/pve/0044-PVE-block-pbs-fast-path-reads-without-allocation-if-.patch new file mode 100644 index 0000000..a85ebc2 --- /dev/null +++ b/debian/patches/pve/0044-PVE-block-pbs-fast-path-reads-without-allocation-if-.patch @@ -0,0 +1,52 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Wed, 9 Dec 2020 11:46:57 +0100 +Subject: [PATCH] PVE: block/pbs: fast-path reads without allocation if + possible + +...and switch over to g_malloc/g_free while at it to align with other +QEMU code. + +Tracing shows the fast-path is taken almost all the time, though not +100% so the slow one is still necessary. + +Signed-off-by: Stefan Reiter +--- + block/pbs.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +diff --git a/block/pbs.c b/block/pbs.c +index 1481a2bfd1..fbf0d8d845 100644 +--- a/block/pbs.c ++++ b/block/pbs.c +@@ -200,7 +200,16 @@ static coroutine_fn int pbs_co_preadv(BlockDriverState *bs, + BDRVPBSState *s = bs->opaque; + int ret; + char *pbs_error = NULL; +- uint8_t *buf = malloc(bytes); ++ uint8_t *buf; ++ bool inline_buf = true; ++ ++ /* for single-buffer IO vectors we can fast-path the write directly to it */ ++ if (qiov->niov == 1 && qiov->iov->iov_len >= bytes) { ++ buf = qiov->iov->iov_base; ++ } else { ++ inline_buf = false; ++ buf = g_malloc(bytes); ++ } + + ReadCallbackData rcb = { + .co = qemu_coroutine_self(), +@@ -218,8 +227,10 @@ static coroutine_fn int pbs_co_preadv(BlockDriverState *bs, + return -EIO; + } + +- qemu_iovec_from_buf(qiov, 0, buf, bytes); +- free(buf); ++ if (!inline_buf) { ++ qemu_iovec_from_buf(qiov, 0, buf, bytes); ++ g_free(buf); ++ } + + return ret; + } diff --git a/debian/patches/pve/0045-PVE-block-stream-increase-chunk-size.patch b/debian/patches/pve/0045-PVE-block-stream-increase-chunk-size.patch new file mode 100644 index 0000000..601f8c7 --- /dev/null +++ b/debian/patches/pve/0045-PVE-block-stream-increase-chunk-size.patch @@ -0,0 +1,23 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Tue, 2 Mar 2021 16:34:28 +0100 +Subject: [PATCH] PVE: block/stream: increase chunk size + +Ceph favors bigger chunks, so increase to 4M. +--- + block/stream.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/stream.c b/block/stream.c +index 236384f2f7..a5371420e3 100644 +--- a/block/stream.c ++++ b/block/stream.c +@@ -26,7 +26,7 @@ enum { + * large enough to process multiple clusters in a single call, so + * that populating contiguous regions of the image is efficient. + */ +- STREAM_CHUNK = 512 * 1024, /* in bytes */ ++ STREAM_CHUNK = 4 * 1024 * 1024, /* in bytes */ + }; + + typedef struct StreamBlockJob { diff --git a/debian/patches/pve/0046-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch b/debian/patches/pve/0046-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch new file mode 100644 index 0000000..e40fa2e --- /dev/null +++ b/debian/patches/pve/0046-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Tue, 2 Mar 2021 16:11:54 +0100 +Subject: [PATCH] block/io: accept NULL qiov in bdrv_pad_request + +Some operations, e.g. block-stream, perform reads while discarding the +results (only copy-on-read matters). In this case they will pass NULL as +the target QEMUIOVector, which will however trip bdrv_pad_request, since +it wants to extend its passed vector. + +Simply check for NULL and do nothing, there's no reason to pad the +target if it will be discarded anyway. +--- + block/io.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/block/io.c b/block/io.c +index ec5e152bb7..08dee005ec 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -1613,13 +1613,16 @@ static bool bdrv_pad_request(BlockDriverState *bs, + return false; + } + +- qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, +- *qiov, *qiov_offset, *bytes, +- pad->buf + pad->buf_len - pad->tail, pad->tail); ++ if (*qiov) { ++ qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, ++ *qiov, *qiov_offset, *bytes, ++ pad->buf + pad->buf_len - pad->tail, pad->tail); ++ *qiov = &pad->local_qiov; ++ *qiov_offset = 0; ++ } ++ + *bytes += pad->head + pad->tail; + *offset -= pad->head; +- *qiov = &pad->local_qiov; +- *qiov_offset = 0; + + return true; + } diff --git a/debian/patches/pve/0047-block-add-alloc-track-driver.patch b/debian/patches/pve/0047-block-add-alloc-track-driver.patch new file mode 100644 index 0000000..db46371 --- /dev/null +++ b/debian/patches/pve/0047-block-add-alloc-track-driver.patch @@ -0,0 +1,391 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Mon, 7 Dec 2020 15:21:03 +0100 +Subject: [PATCH] block: add alloc-track driver + +Add a new filter node 'alloc-track', which seperates reads and writes to +different children, thus allowing to put a backing image behind any +blockdev (regardless of driver support). Since we can't detect any +pre-allocated blocks, we can only track new writes, hence the write +target ('file') for this node must always be empty. + +Intended use case is for live restoring, i.e. add a backup image as a +block device into a VM, then put an alloc-track on the restore target +and set the backup as backing. With this, one can use a regular +'block-stream' to restore the image, while the VM can already run in the +background. Copy-on-read will help make progress as the VM reads as +well. + +This only worked if the target supports backing images, so up until now +only for qcow2, with alloc-track any driver for the target can be used. + +If 'auto-remove' is set, alloc-track will automatically detach itself +once the backing image is removed. It will be replaced by 'file'. + +Signed-off-by: Stefan Reiter +--- + block/alloc-track.c | 342 ++++++++++++++++++++++++++++++++++++++++++++ + block/meson.build | 1 + + 2 files changed, 343 insertions(+) + create mode 100644 block/alloc-track.c + +diff --git a/block/alloc-track.c b/block/alloc-track.c +new file mode 100644 +index 0000000000..b579380279 +--- /dev/null ++++ b/block/alloc-track.c +@@ -0,0 +1,342 @@ ++/* ++ * Node to allow backing images to be applied to any node. Assumes a blank ++ * image to begin with, only new writes are tracked as allocated, thus this ++ * must never be put on a node that already contains data. ++ * ++ * Copyright (c) 2020 Proxmox Server Solutions GmbH ++ * Copyright (c) 2020 Stefan Reiter ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#include "qemu/osdep.h" ++#include "qapi/error.h" ++#include "block/block_int.h" ++#include "qapi/qmp/qdict.h" ++#include "qapi/qmp/qstring.h" ++#include "qemu/cutils.h" ++#include "qemu/option.h" ++#include "qemu/module.h" ++#include "sysemu/block-backend.h" ++ ++#define TRACK_OPT_AUTO_REMOVE "auto-remove" ++ ++typedef enum DropState { ++ DropNone, ++ DropRequested, ++ DropInProgress, ++} DropState; ++ ++typedef struct { ++ BdrvDirtyBitmap *bitmap; ++ DropState drop_state; ++ bool auto_remove; ++} BDRVAllocTrackState; ++ ++static QemuOptsList runtime_opts = { ++ .name = "alloc-track", ++ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), ++ .desc = { ++ { ++ .name = TRACK_OPT_AUTO_REMOVE, ++ .type = QEMU_OPT_BOOL, ++ .help = "automatically replace this node with 'file' when 'backing'" ++ "is detached", ++ }, ++ { /* end of list */ } ++ }, ++}; ++ ++static void track_refresh_limits(BlockDriverState *bs, Error **errp) ++{ ++ BlockDriverInfo bdi; ++ ++ if (!bs->file) { ++ return; ++ } ++ ++ /* always use alignment from underlying write device so RMW cycle for ++ * bdrv_pwritev reads data from our backing via track_co_preadv (no partial ++ * cluster allocation in 'file') */ ++ bdrv_get_info(bs->file->bs, &bdi); ++ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment, ++ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE)); ++} ++ ++static int track_open(BlockDriverState *bs, QDict *options, int flags, ++ Error **errp) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ QemuOpts *opts; ++ Error *local_err = NULL; ++ int ret = 0; ++ ++ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); ++ qemu_opts_absorb_qdict(opts, options, &local_err); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false); ++ ++ /* open the target (write) node, backing will be attached by block layer */ ++ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, ++ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false, ++ &local_err); ++ if (local_err) { ++ ret = -EINVAL; ++ error_propagate(errp, local_err); ++ goto fail; ++ } ++ ++ track_refresh_limits(bs, errp); ++ uint64_t gran = bs->bl.request_alignment; ++ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err); ++ if (local_err) { ++ ret = -EIO; ++ error_propagate(errp, local_err); ++ goto fail; ++ } ++ ++ s->drop_state = DropNone; ++ ++fail: ++ if (ret < 0) { ++ bdrv_unref_child(bs, bs->file); ++ if (s->bitmap) { ++ bdrv_release_dirty_bitmap(s->bitmap); ++ } ++ } ++ qemu_opts_del(opts); ++ return ret; ++} ++ ++static void track_close(BlockDriverState *bs) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ if (s->bitmap) { ++ bdrv_release_dirty_bitmap(s->bitmap); ++ } ++} ++ ++static int64_t track_getlength(BlockDriverState *bs) ++{ ++ return bdrv_getlength(bs->file->bs); ++} ++ ++static int coroutine_fn track_co_preadv(BlockDriverState *bs, ++ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ QEMUIOVector local_qiov; ++ int ret; ++ ++ /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */ ++ uint64_t cur_offset, local_offset; ++ int64_t local_bytes; ++ bool alloc; ++ ++ /* a read request can span multiple granularity-sized chunks, and can thus ++ * contain blocks with different allocation status - we could just iterate ++ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X ++ * to find the next flip and consider everything up to that in one go */ ++ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) { ++ local_offset = offset + cur_offset; ++ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset); ++ if (alloc) { ++ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset, ++ bytes - cur_offset); ++ } else { ++ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset, ++ bytes - cur_offset); ++ } ++ ++ /* _bitmap_next_X return is -1 if no end found within limit, otherwise ++ * offset of next flip (to start of image) */ ++ local_bytes = local_bytes < 0 ? ++ bytes - cur_offset : ++ local_bytes - local_offset; ++ ++ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes); ++ ++ if (alloc) { ++ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes, ++ &local_qiov, flags); ++ } else if (bs->backing) { ++ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes, ++ &local_qiov, flags); ++ } else { ++ ret = qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes); ++ } ++ ++ if (ret != 0) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int coroutine_fn track_co_pwritev(BlockDriverState *bs, ++ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) ++{ ++ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); ++} ++ ++static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs, ++ int64_t offset, int count, BdrvRequestFlags flags) ++{ ++ return bdrv_pwrite_zeroes(bs->file, offset, count, flags); ++} ++ ++static int coroutine_fn track_co_pdiscard(BlockDriverState *bs, ++ int64_t offset, int count) ++{ ++ return bdrv_co_pdiscard(bs->file, offset, count); ++} ++ ++static coroutine_fn int track_co_flush(BlockDriverState *bs) ++{ ++ return bdrv_co_flush(bs->file->bs); ++} ++ ++static int coroutine_fn track_co_block_status(BlockDriverState *bs, ++ bool want_zero, ++ int64_t offset, ++ int64_t bytes, ++ int64_t *pnum, ++ int64_t *map, ++ BlockDriverState **file) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ ++ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset); ++ int64_t next_flipped; ++ if (alloc) { ++ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes); ++ } else { ++ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes); ++ } ++ ++ /* in case not the entire region has the same state, we need to set pnum to ++ * indicate for how many bytes our result is valid */ ++ *pnum = next_flipped == -1 ? bytes : next_flipped - offset; ++ *map = offset; ++ ++ if (alloc) { ++ *file = bs->file->bs; ++ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; ++ } else if (bs->backing) { ++ *file = bs->backing->bs; ++ } ++ return 0; ++} ++ ++static void track_child_perm(BlockDriverState *bs, BdrvChild *c, ++ BdrvChildRole role, BlockReopenQueue *reopen_queue, ++ uint64_t perm, uint64_t shared, ++ uint64_t *nperm, uint64_t *nshared) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ ++ *nshared = BLK_PERM_ALL; ++ ++ /* in case we're currently dropping ourselves, claim to not use any ++ * permissions at all - which is fine, since from this point on we will ++ * never issue a read or write anymore */ ++ if (s->drop_state == DropInProgress) { ++ *nperm = 0; ++ return; ++ } ++ ++ if (role & BDRV_CHILD_DATA) { ++ *nperm = perm & DEFAULT_PERM_PASSTHROUGH; ++ } else { ++ /* 'backing' is also a child of our BDS, but we don't expect it to be ++ * writeable, so we only forward 'consistent read' */ ++ *nperm = perm & BLK_PERM_CONSISTENT_READ; ++ } ++} ++ ++static void track_drop(void *opaque) ++{ ++ BlockDriverState *bs = (BlockDriverState*)opaque; ++ BlockDriverState *file = bs->file->bs; ++ BDRVAllocTrackState *s = bs->opaque; ++ ++ assert(file); ++ ++ /* we rely on the fact that we're not used anywhere else, so let's wait ++ * until we're only used once - in the drive connected to the guest (and one ++ * ref is held by bdrv_ref in track_change_backing_file) */ ++ if (bs->refcnt > 2) { ++ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque); ++ return; ++ } ++ ++ /* we do not need a bdrv_drained_end, since this is applied only to the node ++ * which gets removed by bdrv_replace_node */ ++ bdrv_drained_begin(bs); ++ ++ /* now that we're drained, we can safely set 'DropInProgress' */ ++ s->drop_state = DropInProgress; ++ bdrv_child_refresh_perms(bs, bs->file, &error_abort); ++ ++ bdrv_replace_node(bs, file, &error_abort); ++ bdrv_unref(bs); ++} ++ ++static int track_change_backing_file(BlockDriverState *bs, ++ const char *backing_file, ++ const char *backing_fmt) ++{ ++ BDRVAllocTrackState *s = bs->opaque; ++ if (s->auto_remove && s->drop_state == DropNone && ++ backing_file == NULL && backing_fmt == NULL) ++ { ++ /* backing file has been disconnected, there's no longer any use for ++ * this node, so let's remove ourselves from the block graph - we need ++ * to schedule this for later however, since when this function is ++ * called, the blockjob modifying us is probably not done yet and has a ++ * blocker on 'bs' */ ++ s->drop_state = DropRequested; ++ bdrv_ref(bs); ++ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs); ++ } ++ ++ return 0; ++} ++ ++static BlockDriver bdrv_alloc_track = { ++ .format_name = "alloc-track", ++ .instance_size = sizeof(BDRVAllocTrackState), ++ ++ .bdrv_file_open = track_open, ++ .bdrv_close = track_close, ++ .bdrv_getlength = track_getlength, ++ .bdrv_child_perm = track_child_perm, ++ .bdrv_refresh_limits = track_refresh_limits, ++ ++ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes, ++ .bdrv_co_pwritev = track_co_pwritev, ++ .bdrv_co_preadv = track_co_preadv, ++ .bdrv_co_pdiscard = track_co_pdiscard, ++ ++ .bdrv_co_flush = track_co_flush, ++ .bdrv_co_flush_to_disk = track_co_flush, ++ ++ .supports_backing = true, ++ ++ .bdrv_co_block_status = track_co_block_status, ++ .bdrv_change_backing_file = track_change_backing_file, ++}; ++ ++static void bdrv_alloc_track_init(void) ++{ ++ bdrv_register(&bdrv_alloc_track); ++} ++ ++block_init(bdrv_alloc_track_init); +diff --git a/block/meson.build b/block/meson.build +index a070060e53..e387990764 100644 +--- a/block/meson.build ++++ b/block/meson.build +@@ -2,6 +2,7 @@ block_ss.add(genh) + block_ss.add(files( + 'accounting.c', + 'aio_task.c', ++ 'alloc-track.c', + 'amend.c', + 'backup.c', + 'backup-dump.c', diff --git a/debian/patches/series b/debian/patches/series index f29a15d..da4f9c7 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -56,3 +56,7 @@ pve/0040-migration-block-dirty-bitmap-migrate-other-bitmaps-e.patch pve/0041-PVE-fall-back-to-open-iscsi-initiatorname.patch pve/0042-PVE-Use-coroutine-QMP-for-backup-cancel_backup.patch pve/0043-PBS-add-master-key-support.patch +pve/0044-PVE-block-pbs-fast-path-reads-without-allocation-if-.patch +pve/0045-PVE-block-stream-increase-chunk-size.patch +pve/0046-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch +pve/0047-block-add-alloc-track-driver.patch