From 9353db47c5e2f849bc1130e5ed6b40ce97798289 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:34 +0300 Subject: [PATCH 01/21] qcow2.h: add missing include qcow2.h depends on block_int.h. Compilation isn't broken currently only due to block_int.h always included before qcow2.h. Though, it seems better to directly include block_int.h in qcow2.h. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-2-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-bitmap.c | 1 - block/qcow2-cache.c | 1 - block/qcow2-cluster.c | 1 - block/qcow2-refcount.c | 1 - block/qcow2-snapshot.c | 1 - block/qcow2.c | 1 - block/qcow2.h | 1 + 7 files changed, 1 insertion(+), 6 deletions(-) diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index 8a75366c92..640da68ce1 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -29,7 +29,6 @@ #include "qapi/error.h" #include "qemu/cutils.h" -#include "block/block_int.h" #include "qcow2.h" /* NOTICE: BME here means Bitmaps Extension and used as a namespace for diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index df02e7b20a..b33bcbc984 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -23,7 +23,6 @@ */ #include "qemu/osdep.h" -#include "block/block_int.h" #include "qemu-common.h" #include "qcow2.h" #include "trace.h" diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index b36f4aa84a..a4c878398e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -27,7 +27,6 @@ #include "qapi/error.h" #include "qemu-common.h" -#include "block/block_int.h" #include "qcow2.h" #include "qemu/bswap.h" #include "trace.h" diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0b09d6838b..4c1794f9af 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -25,7 +25,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu-common.h" -#include "block/block_int.h" #include "qcow2.h" #include "qemu/range.h" #include "qemu/bswap.h" diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index a6ffae89a6..d0e7fa9311 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -24,7 +24,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "block/block_int.h" #include "qcow2.h" #include "qemu/bswap.h" #include "qemu/error-report.h" diff --git a/block/qcow2.c b/block/qcow2.c index d39882785d..38bf729a1e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -27,7 +27,6 @@ #define ZLIB_CONST #include -#include "block/block_int.h" #include "block/qdict.h" #include "sysemu/block-backend.h" #include "qemu/module.h" diff --git a/block/qcow2.h b/block/qcow2.h index 8d92ef1fee..c2b0e955eb 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -28,6 +28,7 @@ #include "crypto/block.h" #include "qemu/coroutine.h" #include "qemu/units.h" +#include "block/block_int.h" //#define DEBUG_ALLOC //#define DEBUG_ALLOC2 From 56e2f1d898849784c25928d49b5143e5e9b71d08 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:35 +0300 Subject: [PATCH 02/21] qcow2: add separate file for threaded data processing functions Move compression-on-threads to separate file. Encryption will be in it too. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-3-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/Makefile.objs | 2 +- block/qcow2-threads.c | 201 ++++++++++++++++++++++++++++++++++++++++++ block/qcow2.c | 169 ----------------------------------- block/qcow2.h | 7 ++ 4 files changed, 209 insertions(+), 170 deletions(-) create mode 100644 block/qcow2-threads.c diff --git a/block/Makefile.objs b/block/Makefile.objs index 7a81892a52..ae11605c9f 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -6,7 +6,7 @@ block-obj-$(CONFIG_BOCHS) += bochs.o block-obj-$(CONFIG_VVFAT) += vvfat.o block-obj-$(CONFIG_DMG) += dmg.o -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-bitmap.o +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-bitmap.o qcow2-threads.o block-obj-$(CONFIG_QED) += qed.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-$(CONFIG_QED) += qed-check.o block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c new file mode 100644 index 0000000000..85b8034fd2 --- /dev/null +++ b/block/qcow2-threads.c @@ -0,0 +1,201 @@ +/* + * Threaded data processing for Qcow2: compression, encryption + * + * Copyright (c) 2004-2006 Fabrice Bellard + * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" + +#define ZLIB_CONST +#include + +#include "qcow2.h" +#include "block/thread-pool.h" + +#define MAX_COMPRESS_THREADS 4 + +typedef ssize_t (*Qcow2CompressFunc)(void *dest, size_t dest_size, + const void *src, size_t src_size); +typedef struct Qcow2CompressData { + void *dest; + size_t dest_size; + const void *src; + size_t src_size; + ssize_t ret; + + Qcow2CompressFunc func; +} Qcow2CompressData; + +/* + * qcow2_compress() + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: compressed size on success + * -ENOMEM destination buffer is not enough to store compressed data + * -EIO on any other error + */ +static ssize_t qcow2_compress(void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + ssize_t ret; + z_stream strm; + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + -12, 9, Z_DEFAULT_STRATEGY); + if (ret != Z_OK) { + return -EIO; + } + + /* + * strm.next_in is not const in old zlib versions, such as those used on + * OpenBSD/NetBSD, so cast the const away + */ + strm.avail_in = src_size; + strm.next_in = (void *) src; + strm.avail_out = dest_size; + strm.next_out = dest; + + ret = deflate(&strm, Z_FINISH); + if (ret == Z_STREAM_END) { + ret = dest_size - strm.avail_out; + } else { + ret = (ret == Z_OK ? -ENOMEM : -EIO); + } + + deflateEnd(&strm); + + return ret; +} + +/* + * qcow2_decompress() + * + * Decompress some data (not more than @src_size bytes) to produce exactly + * @dest_size bytes. + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: 0 on success + * -1 on fail + */ +static ssize_t qcow2_decompress(void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + int ret = 0; + z_stream strm; + + memset(&strm, 0, sizeof(strm)); + strm.avail_in = src_size; + strm.next_in = (void *) src; + strm.avail_out = dest_size; + strm.next_out = dest; + + ret = inflateInit2(&strm, -12); + if (ret != Z_OK) { + return -1; + } + + ret = inflate(&strm, Z_FINISH); + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || strm.avail_out != 0) { + /* + * We approve Z_BUF_ERROR because we need @dest buffer to be filled, but + * @src buffer may be processed partly (because in qcow2 we know size of + * compressed data with precision of one sector) + */ + ret = -1; + } + + inflateEnd(&strm); + + return ret; +} + +static int qcow2_compress_pool_func(void *opaque) +{ + Qcow2CompressData *data = opaque; + + data->ret = data->func(data->dest, data->dest_size, + data->src, data->src_size); + + return 0; +} + +static void qcow2_compress_complete(void *opaque, int ret) +{ + qemu_coroutine_enter(opaque); +} + +static ssize_t coroutine_fn +qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size, Qcow2CompressFunc func) +{ + BDRVQcow2State *s = bs->opaque; + BlockAIOCB *acb; + ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + Qcow2CompressData arg = { + .dest = dest, + .dest_size = dest_size, + .src = src, + .src_size = src_size, + .func = func, + }; + + while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { + qemu_co_queue_wait(&s->compress_wait_queue, NULL); + } + + s->nb_compress_threads++; + acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg, + qcow2_compress_complete, + qemu_coroutine_self()); + + if (!acb) { + s->nb_compress_threads--; + return -EINVAL; + } + qemu_coroutine_yield(); + s->nb_compress_threads--; + qemu_co_queue_next(&s->compress_wait_queue); + + return arg.ret; +} + +ssize_t coroutine_fn +qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, + qcow2_compress); +} + +ssize_t coroutine_fn +qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, + qcow2_decompress); +} diff --git a/block/qcow2.c b/block/qcow2.c index 38bf729a1e..a8c4b11539 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -24,9 +24,6 @@ #include "qemu/osdep.h" -#define ZLIB_CONST -#include - #include "block/qdict.h" #include "sysemu/block-backend.h" #include "qemu/module.h" @@ -43,7 +40,6 @@ #include "qapi/qobject-input-visitor.h" #include "qapi/qapi-visit-block-core.h" #include "crypto.h" -#include "block/thread-pool.h" /* Differences with QCOW: @@ -3920,171 +3916,6 @@ fail: return ret; } -/* - * qcow2_compress() - * - * @dest - destination buffer, @dest_size bytes - * @src - source buffer, @src_size bytes - * - * Returns: compressed size on success - * -ENOMEM destination buffer is not enough to store compressed data - * -EIO on any other error - */ -static ssize_t qcow2_compress(void *dest, size_t dest_size, - const void *src, size_t src_size) -{ - ssize_t ret; - z_stream strm; - - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, - -12, 9, Z_DEFAULT_STRATEGY); - if (ret != Z_OK) { - return -EIO; - } - - /* strm.next_in is not const in old zlib versions, such as those used on - * OpenBSD/NetBSD, so cast the const away */ - strm.avail_in = src_size; - strm.next_in = (void *) src; - strm.avail_out = dest_size; - strm.next_out = dest; - - ret = deflate(&strm, Z_FINISH); - if (ret == Z_STREAM_END) { - ret = dest_size - strm.avail_out; - } else { - ret = (ret == Z_OK ? -ENOMEM : -EIO); - } - - deflateEnd(&strm); - - return ret; -} - -/* - * qcow2_decompress() - * - * Decompress some data (not more than @src_size bytes) to produce exactly - * @dest_size bytes. - * - * @dest - destination buffer, @dest_size bytes - * @src - source buffer, @src_size bytes - * - * Returns: 0 on success - * -1 on fail - */ -static ssize_t qcow2_decompress(void *dest, size_t dest_size, - const void *src, size_t src_size) -{ - int ret = 0; - z_stream strm; - - memset(&strm, 0, sizeof(strm)); - strm.avail_in = src_size; - strm.next_in = (void *) src; - strm.avail_out = dest_size; - strm.next_out = dest; - - ret = inflateInit2(&strm, -12); - if (ret != Z_OK) { - return -1; - } - - ret = inflate(&strm, Z_FINISH); - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || strm.avail_out != 0) { - /* We approve Z_BUF_ERROR because we need @dest buffer to be filled, but - * @src buffer may be processed partly (because in qcow2 we know size of - * compressed data with precision of one sector) */ - ret = -1; - } - - inflateEnd(&strm); - - return ret; -} - -#define MAX_COMPRESS_THREADS 4 - -typedef ssize_t (*Qcow2CompressFunc)(void *dest, size_t dest_size, - const void *src, size_t src_size); -typedef struct Qcow2CompressData { - void *dest; - size_t dest_size; - const void *src; - size_t src_size; - ssize_t ret; - - Qcow2CompressFunc func; -} Qcow2CompressData; - -static int qcow2_compress_pool_func(void *opaque) -{ - Qcow2CompressData *data = opaque; - - data->ret = data->func(data->dest, data->dest_size, - data->src, data->src_size); - - return 0; -} - -static void qcow2_compress_complete(void *opaque, int ret) -{ - qemu_coroutine_enter(opaque); -} - -static ssize_t coroutine_fn -qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, - const void *src, size_t src_size, Qcow2CompressFunc func) -{ - BDRVQcow2State *s = bs->opaque; - BlockAIOCB *acb; - ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - Qcow2CompressData arg = { - .dest = dest, - .dest_size = dest_size, - .src = src, - .src_size = src_size, - .func = func, - }; - - while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { - qemu_co_queue_wait(&s->compress_wait_queue, NULL); - } - - s->nb_compress_threads++; - acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg, - qcow2_compress_complete, - qemu_coroutine_self()); - - if (!acb) { - s->nb_compress_threads--; - return -EINVAL; - } - qemu_coroutine_yield(); - s->nb_compress_threads--; - qemu_co_queue_next(&s->compress_wait_queue); - - return arg.ret; -} - -static ssize_t coroutine_fn -qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, - const void *src, size_t src_size) -{ - return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, - qcow2_compress); -} - -static ssize_t coroutine_fn -qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, - const void *src, size_t src_size) -{ - return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, - qcow2_decompress); -} - /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static coroutine_fn int diff --git a/block/qcow2.h b/block/qcow2.h index c2b0e955eb..8207ed374e 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -738,4 +738,11 @@ void qcow2_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, Error **errp); +ssize_t coroutine_fn +qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size); +ssize_t coroutine_fn +qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size); + #endif From 269062efc8c665707ef88d425d383f4223ce1b11 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:36 +0300 Subject: [PATCH 03/21] qcow2-threads: use thread_pool_submit_co Use thread_pool_submit_co, instead of reinventing it here. Note, that thread_pool_submit_aio() never returns NULL, so checking it was an extra thing. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-4-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-threads.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 85b8034fd2..5a39fdac69 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -144,17 +144,11 @@ static int qcow2_compress_pool_func(void *opaque) return 0; } -static void qcow2_compress_complete(void *opaque, int ret) -{ - qemu_coroutine_enter(opaque); -} - static ssize_t coroutine_fn qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, const void *src, size_t src_size, Qcow2CompressFunc func) { BDRVQcow2State *s = bs->opaque; - BlockAIOCB *acb; ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); Qcow2CompressData arg = { .dest = dest, @@ -169,16 +163,9 @@ qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, } s->nb_compress_threads++; - acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg, - qcow2_compress_complete, - qemu_coroutine_self()); - - if (!acb) { - s->nb_compress_threads--; - return -EINVAL; - } - qemu_coroutine_yield(); + thread_pool_submit_co(pool, qcow2_compress_pool_func, &arg); s->nb_compress_threads--; + qemu_co_queue_next(&s->compress_wait_queue); return arg.ret; From 0f5636c51c76fed01e2ed02fbb7a46da869f0489 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:37 +0300 Subject: [PATCH 04/21] qcow2-threads: qcow2_co_do_compress: protect queuing by mutex Drop dependence on AioContext lock. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Paolo Bonzini Reviewed-by: Max Reitz Message-id: 20190506142741.41731-5-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-threads.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 5a39fdac69..50a9fdeec5 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -158,15 +158,19 @@ qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, .func = func, }; + qemu_co_mutex_lock(&s->lock); while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { - qemu_co_queue_wait(&s->compress_wait_queue, NULL); + qemu_co_queue_wait(&s->compress_wait_queue, &s->lock); } - s->nb_compress_threads++; - thread_pool_submit_co(pool, qcow2_compress_pool_func, &arg); - s->nb_compress_threads--; + qemu_co_mutex_unlock(&s->lock); + thread_pool_submit_co(pool, qcow2_compress_pool_func, &arg); + + qemu_co_mutex_lock(&s->lock); + s->nb_compress_threads--; qemu_co_queue_next(&s->compress_wait_queue); + qemu_co_mutex_unlock(&s->lock); return arg.ret; } From 6f13a316dd3a6916de4733e56585127445b19458 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:38 +0300 Subject: [PATCH 05/21] qcow2-threads: split out generic path Move generic part out of qcow2_co_do_compress, to reuse it for encryption and rename things that would be shared with encryption path. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-6-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-threads.c | 47 ++++++++++++++++++++++++++++--------------- block/qcow2.c | 2 +- block/qcow2.h | 4 ++-- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 50a9fdeec5..650aa2ed19 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -31,7 +31,36 @@ #include "qcow2.h" #include "block/thread-pool.h" -#define MAX_COMPRESS_THREADS 4 +#define QCOW2_MAX_THREADS 4 + +static int coroutine_fn +qcow2_co_process(BlockDriverState *bs, ThreadPoolFunc *func, void *arg) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + + qemu_co_mutex_lock(&s->lock); + while (s->nb_threads >= QCOW2_MAX_THREADS) { + qemu_co_queue_wait(&s->thread_task_queue, &s->lock); + } + s->nb_threads++; + qemu_co_mutex_unlock(&s->lock); + + ret = thread_pool_submit_co(pool, func, arg); + + qemu_co_mutex_lock(&s->lock); + s->nb_threads--; + qemu_co_queue_next(&s->thread_task_queue); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + + +/* + * Compression + */ typedef ssize_t (*Qcow2CompressFunc)(void *dest, size_t dest_size, const void *src, size_t src_size); @@ -148,8 +177,6 @@ static ssize_t coroutine_fn qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, const void *src, size_t src_size, Qcow2CompressFunc func) { - BDRVQcow2State *s = bs->opaque; - ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); Qcow2CompressData arg = { .dest = dest, .dest_size = dest_size, @@ -158,19 +185,7 @@ qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, .func = func, }; - qemu_co_mutex_lock(&s->lock); - while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { - qemu_co_queue_wait(&s->compress_wait_queue, &s->lock); - } - s->nb_compress_threads++; - qemu_co_mutex_unlock(&s->lock); - - thread_pool_submit_co(pool, qcow2_compress_pool_func, &arg); - - qemu_co_mutex_lock(&s->lock); - s->nb_compress_threads--; - qemu_co_queue_next(&s->compress_wait_queue); - qemu_co_mutex_unlock(&s->lock); + qcow2_co_process(bs, qcow2_compress_pool_func, &arg); return arg.ret; } diff --git a/block/qcow2.c b/block/qcow2.c index a8c4b11539..1413df1389 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1693,7 +1693,7 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, } #endif - qemu_co_queue_init(&s->compress_wait_queue); + qemu_co_queue_init(&s->thread_task_queue); return ret; diff --git a/block/qcow2.h b/block/qcow2.h index 8207ed374e..637552e137 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -350,8 +350,8 @@ typedef struct BDRVQcow2State { char *image_backing_format; char *image_data_file; - CoQueue compress_wait_queue; - int nb_compress_threads; + CoQueue thread_task_queue; + int nb_threads; BdrvChild *data_file; } BDRVQcow2State; From f24196d388ec781ce3ae99c1fb78ba9755785f91 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:39 +0300 Subject: [PATCH 06/21] qcow2: qcow2_co_preadv: improve locking Background: decryption will be done in threads, to take benefit of it, we should move it out of the lock first. But let's go further: it turns out, that only qcow2_get_cluster_offset() needs locking, so reduce locking to it. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-id: 20190506142741.41731-7-vsementsov@virtuozzo.com Reviewed-by: Alberto Garcia Signed-off-by: Max Reitz --- block/qcow2.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 1413df1389..4a89850389 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1969,8 +1969,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, qemu_iovec_init(&hd_qiov, qiov->niov); - qemu_co_mutex_lock(&s->lock); - while (bytes != 0) { /* prepare next request */ @@ -1980,7 +1978,9 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); } + qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); if (ret < 0) { goto fail; } @@ -1995,10 +1995,8 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, if (bs->backing) { BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); - qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_preadv(bs->backing, offset, cur_bytes, &hd_qiov, 0); - qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } @@ -2014,11 +2012,9 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_COMPRESSED: - qemu_co_mutex_unlock(&s->lock); ret = qcow2_co_preadv_compressed(bs, cluster_offset, offset, cur_bytes, &hd_qiov); - qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } @@ -2055,11 +2051,9 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_preadv(s->data_file, cluster_offset + offset_in_cluster, cur_bytes, &hd_qiov, 0); - qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } @@ -2094,8 +2088,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, ret = 0; fail: - qemu_co_mutex_unlock(&s->lock); - qemu_iovec_destroy(&hd_qiov); qemu_vfree(cluster_data); From 5447c3a03f6775a99da6e36cc7eda09f293d7521 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:40 +0300 Subject: [PATCH 07/21] qcow2: bdrv_co_pwritev: move encryption code out of the lock Encryption will be done in threads, to take benefit of it, we should move it out of the lock first. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-8-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 4a89850389..4b4a0ce6e0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2168,11 +2168,20 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, &cluster_offset, &l2meta); if (ret < 0) { - goto fail; + goto out_locked; } assert((cluster_offset & 511) == 0); + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + offset_in_cluster, + cur_bytes, true); + if (ret < 0) { + goto out_locked; + } + + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); @@ -2184,7 +2193,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, * s->cluster_size); if (cluster_data == NULL) { ret = -ENOMEM; - goto fail; + goto out_unlocked; } } @@ -2199,40 +2208,34 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, cluster_data, cur_bytes, NULL) < 0) { ret = -EIO; - goto fail; + goto out_unlocked; } qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } - ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + offset_in_cluster, cur_bytes, true); - if (ret < 0) { - goto fail; - } - /* If we need to do COW, check if it's possible to merge the * writing of the guest data together with that of the COW regions. * If it's not possible (or not necessary) then write the * guest data now. */ if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) { - qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), cluster_offset + offset_in_cluster); ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster, cur_bytes, &hd_qiov, 0); - qemu_co_mutex_lock(&s->lock); if (ret < 0) { - goto fail; + goto out_unlocked; } } + qemu_co_mutex_lock(&s->lock); + ret = qcow2_handle_l2meta(bs, &l2meta, true); if (ret) { - goto fail; + goto out_locked; } bytes -= cur_bytes; @@ -2241,8 +2244,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); } ret = 0; + goto out_locked; -fail: +out_unlocked: + qemu_co_mutex_lock(&s->lock); + +out_locked: qcow2_handle_l2meta(bs, &l2meta, false); qemu_co_mutex_unlock(&s->lock); From 8ac0f15f335b8b58e974fb7a9d193ba56ea18542 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 6 May 2019 17:27:41 +0300 Subject: [PATCH 08/21] qcow2: do encryption in threads Do encryption/decryption in threads, like it is already done for compression. This improves asynchronous encrypted io. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 20190506142741.41731-9-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-cluster.c | 7 ++--- block/qcow2-threads.c | 65 +++++++++++++++++++++++++++++++++++++++++-- block/qcow2.c | 22 +++++---------- block/qcow2.h | 8 ++++++ 4 files changed, 81 insertions(+), 21 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index a4c878398e..4a929900cf 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -471,13 +471,12 @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, { if (bytes && bs->encrypted) { BDRVQcow2State *s = bs->opaque; - int64_t offset = (s->crypt_physical_offset ? - (cluster_offset + offset_in_cluster) : - (src_cluster_offset + offset_in_cluster)); assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); assert((bytes & ~BDRV_SECTOR_MASK) == 0); assert(s->crypto); - if (qcrypto_block_encrypt(s->crypto, offset, buffer, bytes, NULL) < 0) { + if (qcow2_co_encrypt(bs, cluster_offset, + src_cluster_offset + offset_in_cluster, + buffer, bytes) < 0) { return false; } } diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 650aa2ed19..3b1e63fe41 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -30,8 +30,7 @@ #include "qcow2.h" #include "block/thread-pool.h" - -#define QCOW2_MAX_THREADS 4 +#include "crypto.h" static int coroutine_fn qcow2_co_process(BlockDriverState *bs, ThreadPoolFunc *func, void *arg) @@ -205,3 +204,65 @@ qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, qcow2_decompress); } + + +/* + * Cryptography + */ + +/* + * Qcow2EncDecFunc: common prototype of qcrypto_block_encrypt() and + * qcrypto_block_decrypt() functions. + */ +typedef int (*Qcow2EncDecFunc)(QCryptoBlock *block, uint64_t offset, + uint8_t *buf, size_t len, Error **errp); + +typedef struct Qcow2EncDecData { + QCryptoBlock *block; + uint64_t offset; + uint8_t *buf; + size_t len; + + Qcow2EncDecFunc func; +} Qcow2EncDecData; + +static int qcow2_encdec_pool_func(void *opaque) +{ + Qcow2EncDecData *data = opaque; + + return data->func(data->block, data->offset, data->buf, data->len, NULL); +} + +static int coroutine_fn +qcow2_co_encdec(BlockDriverState *bs, uint64_t file_cluster_offset, + uint64_t offset, void *buf, size_t len, Qcow2EncDecFunc func) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2EncDecData arg = { + .block = s->crypto, + .offset = s->crypt_physical_offset ? + file_cluster_offset + offset_into_cluster(s, offset) : + offset, + .buf = buf, + .len = len, + .func = func, + }; + + return qcow2_co_process(bs, qcow2_encdec_pool_func, &arg); +} + +int coroutine_fn +qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset, + uint64_t offset, void *buf, size_t len) +{ + return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len, + qcrypto_block_encrypt); +} + +int coroutine_fn +qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset, + uint64_t offset, void *buf, size_t len) +{ + return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len, + qcrypto_block_decrypt); +} diff --git a/block/qcow2.c b/block/qcow2.c index 4b4a0ce6e0..dea765b2f4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -297,7 +297,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, } s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", qcow2_crypto_hdr_read_func, - bs, cflags, 1, errp); + bs, cflags, QCOW2_MAX_THREADS, errp); if (!s->crypto) { return -EINVAL; } @@ -1538,7 +1538,8 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; } s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", - NULL, NULL, cflags, 1, errp); + NULL, NULL, cflags, + QCOW2_MAX_THREADS, errp); if (!s->crypto) { ret = -EINVAL; goto fail; @@ -2061,13 +2062,8 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, assert(s->crypto); assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - if (qcrypto_block_decrypt(s->crypto, - (s->crypt_physical_offset ? - cluster_offset + offset_in_cluster : - offset), - cluster_data, - cur_bytes, - NULL) < 0) { + if (qcow2_co_decrypt(bs, cluster_offset, offset, + cluster_data, cur_bytes) < 0) { ret = -EIO; goto fail; } @@ -2201,12 +2197,8 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - if (qcrypto_block_encrypt(s->crypto, - (s->crypt_physical_offset ? - cluster_offset + offset_in_cluster : - offset), - cluster_data, - cur_bytes, NULL) < 0) { + if (qcow2_co_encrypt(bs, cluster_offset, offset, + cluster_data, cur_bytes) < 0) { ret = -EIO; goto out_unlocked; } diff --git a/block/qcow2.h b/block/qcow2.h index 637552e137..e873ae5d12 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -268,6 +268,8 @@ typedef struct Qcow2BitmapHeaderExt { uint64_t bitmap_directory_offset; } QEMU_PACKED Qcow2BitmapHeaderExt; +#define QCOW2_MAX_THREADS 4 + typedef struct BDRVQcow2State { int cluster_bits; int cluster_size; @@ -744,5 +746,11 @@ qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, ssize_t coroutine_fn qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, const void *src, size_t src_size); +int coroutine_fn +qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset, + uint64_t offset, void *buf, size_t len); +int coroutine_fn +qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset, + uint64_t offset, void *buf, size_t len); #endif From c2da3413c021398152e98022261bb1643276a2fe Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 29 Apr 2019 12:08:38 +0300 Subject: [PATCH 09/21] block/backup: simplify backup_incremental_init_copy_bitmap Simplify backup_incremental_init_copy_bitmap using the function bdrv_dirty_bitmap_next_dirty_area. Note: move to job->len instead of bitmap size: it should not matter but less code. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-id: 20190429090842.57910-2-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/backup.c | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/block/backup.c b/block/backup.c index 916817d8b1..db83b09a0b 100644 --- a/block/backup.c +++ b/block/backup.c @@ -394,43 +394,27 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) /* init copy_bitmap from sync_bitmap */ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job) { - BdrvDirtyBitmapIter *dbi; - int64_t offset; - int64_t end = DIV_ROUND_UP(bdrv_dirty_bitmap_size(job->sync_bitmap), - job->cluster_size); + uint64_t offset = 0; + uint64_t bytes = job->len; - dbi = bdrv_dirty_iter_new(job->sync_bitmap); - while ((offset = bdrv_dirty_iter_next(dbi)) != -1) { - int64_t cluster = offset / job->cluster_size; - int64_t next_cluster; + while (bdrv_dirty_bitmap_next_dirty_area(job->sync_bitmap, + &offset, &bytes)) + { + uint64_t cluster = offset / job->cluster_size; + uint64_t end_cluster = DIV_ROUND_UP(offset + bytes, job->cluster_size); - offset += bdrv_dirty_bitmap_granularity(job->sync_bitmap); - if (offset >= bdrv_dirty_bitmap_size(job->sync_bitmap)) { - hbitmap_set(job->copy_bitmap, cluster, end - cluster); + hbitmap_set(job->copy_bitmap, cluster, end_cluster - cluster); + + offset = end_cluster * job->cluster_size; + if (offset >= job->len) { break; } - - offset = bdrv_dirty_bitmap_next_zero(job->sync_bitmap, offset, - UINT64_MAX); - if (offset == -1) { - hbitmap_set(job->copy_bitmap, cluster, end - cluster); - break; - } - - next_cluster = DIV_ROUND_UP(offset, job->cluster_size); - hbitmap_set(job->copy_bitmap, cluster, next_cluster - cluster); - if (next_cluster >= end) { - break; - } - - bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size); + bytes = job->len - offset; } /* TODO job_progress_set_remaining() would make more sense */ job_progress_update(&job->common.job, job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size); - - bdrv_dirty_iter_free(dbi); } static int coroutine_fn backup_run(Job *job, Error **errp) From a8389e315ef71913ae99cf8f3b1f89e84631f599 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 29 Apr 2019 12:08:39 +0300 Subject: [PATCH 10/21] block/backup: move to copy_bitmap with granularity We are going to share this bitmap between backup and backup-top filter driver, so let's share something more meaningful. It also simplifies some calculations. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-id: 20190429090842.57910-3-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/backup.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/block/backup.c b/block/backup.c index db83b09a0b..98a2d2b070 100644 --- a/block/backup.c +++ b/block/backup.c @@ -112,7 +112,8 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0; int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; - hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1); + assert(QEMU_IS_ALIGNED(start, job->cluster_size)); + hbitmap_reset(job->copy_bitmap, start, job->cluster_size); nbytes = MIN(job->cluster_size, job->len - start); if (!*bounce_buffer) { *bounce_buffer = blk_blockalign(blk, job->cluster_size); @@ -145,7 +146,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, return nbytes; fail: - hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1); + hbitmap_set(job->copy_bitmap, start, job->cluster_size); return ret; } @@ -165,16 +166,15 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size)); + assert(QEMU_IS_ALIGNED(start, job->cluster_size)); nbytes = MIN(job->copy_range_size, end - start); nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size); - hbitmap_reset(job->copy_bitmap, start / job->cluster_size, - nr_clusters); + hbitmap_reset(job->copy_bitmap, start, job->cluster_size * nr_clusters); ret = blk_co_copy_range(blk, start, job->target, start, nbytes, read_flags, write_flags); if (ret < 0) { trace_backup_do_cow_copy_range_fail(job, start, ret); - hbitmap_set(job->copy_bitmap, start / job->cluster_size, - nr_clusters); + hbitmap_set(job->copy_bitmap, start, job->cluster_size * nr_clusters); return ret; } @@ -202,7 +202,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job, cow_request_begin(&cow_request, job, start, end); while (start < end) { - if (!hbitmap_get(job->copy_bitmap, start / job->cluster_size)) { + if (!hbitmap_get(job->copy_bitmap, start)) { trace_backup_do_cow_skip(job, start); start += job->cluster_size; continue; /* already copied */ @@ -298,12 +298,16 @@ static void backup_clean(Job *job) assert(s->target); blk_unref(s->target); s->target = NULL; + + if (s->copy_bitmap) { + hbitmap_free(s->copy_bitmap); + s->copy_bitmap = NULL; + } } void backup_do_checkpoint(BlockJob *job, Error **errp) { BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); - int64_t len; assert(block_job_driver(job) == &backup_job_driver); @@ -313,8 +317,7 @@ void backup_do_checkpoint(BlockJob *job, Error **errp) return; } - len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size); - hbitmap_set(backup_job->copy_bitmap, 0, len); + hbitmap_set(backup_job->copy_bitmap, 0, backup_job->len); } static void backup_drain(BlockJob *job) @@ -369,16 +372,16 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) { int ret; bool error_is_read; - int64_t cluster; + int64_t offset; HBitmapIter hbi; hbitmap_iter_init(&hbi, job->copy_bitmap, 0); - while ((cluster = hbitmap_iter_next(&hbi)) != -1) { + while ((offset = hbitmap_iter_next(&hbi)) != -1) { do { if (yield_and_check(job)) { return 0; } - ret = backup_do_cow(job, cluster * job->cluster_size, + ret = backup_do_cow(job, offset, job->cluster_size, &error_is_read, false); if (ret < 0 && backup_error_action(job, error_is_read, -ret) == BLOCK_ERROR_ACTION_REPORT) @@ -400,12 +403,9 @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job) while (bdrv_dirty_bitmap_next_dirty_area(job->sync_bitmap, &offset, &bytes)) { - uint64_t cluster = offset / job->cluster_size; - uint64_t end_cluster = DIV_ROUND_UP(offset + bytes, job->cluster_size); + hbitmap_set(job->copy_bitmap, offset, bytes); - hbitmap_set(job->copy_bitmap, cluster, end_cluster - cluster); - - offset = end_cluster * job->cluster_size; + offset += bytes; if (offset >= job->len) { break; } @@ -414,30 +414,27 @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job) /* TODO job_progress_set_remaining() would make more sense */ job_progress_update(&job->common.job, - job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size); + job->len - hbitmap_count(job->copy_bitmap)); } static int coroutine_fn backup_run(Job *job, Error **errp) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); BlockDriverState *bs = blk_bs(s->common.blk); - int64_t offset, nb_clusters; + int64_t offset; int ret = 0; QLIST_INIT(&s->inflight_reqs); qemu_co_rwlock_init(&s->flush_rwlock); - nb_clusters = DIV_ROUND_UP(s->len, s->cluster_size); job_progress_set_remaining(job, s->len); - s->copy_bitmap = hbitmap_alloc(nb_clusters, 0); if (s->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { backup_incremental_init_copy_bitmap(s); } else { - hbitmap_set(s->copy_bitmap, 0, nb_clusters); + hbitmap_set(s->copy_bitmap, 0, s->len); } - s->before_write.notify = backup_before_write_notify; bdrv_add_before_write_notifier(bs, &s->before_write); @@ -518,7 +515,6 @@ static int coroutine_fn backup_run(Job *job, Error **errp) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&s->flush_rwlock); qemu_co_rwlock_unlock(&s->flush_rwlock); - hbitmap_free(s->copy_bitmap); return ret; } @@ -668,6 +664,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, } else { job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); } + + job->copy_bitmap = hbitmap_alloc(len, ctz32(job->cluster_size)); job->use_copy_range = true; job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk), blk_get_max_transfer(job->target)); From 9eb5a248f3e50c1f034bc6ff4b2f25c8c56515a5 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 29 Apr 2019 12:08:40 +0300 Subject: [PATCH 11/21] block/backup: refactor and tolerate unallocated cluster skipping Split allocation checking to separate function and reduce nesting. Consider bdrv_is_allocated() fail as allocated area, as copying more than needed is not wrong (and we do it anyway) and seems better than fail the whole job. And, most probably we will fail on the next read, if there are real problem with source. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-id: 20190429090842.57910-4-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/backup.c | 60 +++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/block/backup.c b/block/backup.c index 98a2d2b070..78f1b79354 100644 --- a/block/backup.c +++ b/block/backup.c @@ -368,6 +368,22 @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job) return false; } +static bool bdrv_is_unallocated_range(BlockDriverState *bs, + int64_t offset, int64_t bytes) +{ + int64_t end = offset + bytes; + + while (offset < end && !bdrv_is_allocated(bs, offset, bytes, &bytes)) { + if (bytes == 0) { + return true; + } + offset += bytes; + bytes = end - offset; + } + + return offset >= end; +} + static int coroutine_fn backup_run_incremental(BackupBlockJob *job) { int ret; @@ -453,49 +469,19 @@ static int coroutine_fn backup_run(Job *job, Error **errp) for (offset = 0; offset < s->len; offset += s->cluster_size) { bool error_is_read; - int alloced = 0; if (yield_and_check(s)) { break; } - if (s->sync_mode == MIRROR_SYNC_MODE_TOP) { - int i; - int64_t n; - - /* Check to see if these blocks are already in the - * backing file. */ - - for (i = 0; i < s->cluster_size;) { - /* bdrv_is_allocated() only returns true/false based - * on the first set of sectors it comes across that - * are are all in the same state. - * For that reason we must verify each sector in the - * backup cluster length. We end up copying more than - * needed but at some point that is always the case. */ - alloced = - bdrv_is_allocated(bs, offset + i, - s->cluster_size - i, &n); - i += n; - - if (alloced || n == 0) { - break; - } - } - - /* If the above loop never found any sectors that are in - * the topmost image, skip this backup. */ - if (alloced == 0) { - continue; - } - } - /* FULL sync mode we copy the whole drive. */ - if (alloced < 0) { - ret = alloced; - } else { - ret = backup_do_cow(s, offset, s->cluster_size, - &error_is_read, false); + if (s->sync_mode == MIRROR_SYNC_MODE_TOP && + bdrv_is_unallocated_range(bs, offset, s->cluster_size)) + { + continue; } + + ret = backup_do_cow(s, offset, s->cluster_size, + &error_is_read, false); if (ret < 0) { /* Depending on error action, fail now or retry cluster */ BlockErrorAction action = From c334e897d08eea1f5a3a95f6a2208afe6757c103 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 29 Apr 2019 12:08:41 +0300 Subject: [PATCH 12/21] block/backup: unify different modes code path Do full, top and incremental mode copying all in one place. This unifies the code path and helps further improvements. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-id: 20190429090842.57910-5-vsementsov@virtuozzo.com Signed-off-by: Max Reitz --- block/backup.c | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/block/backup.c b/block/backup.c index 78f1b79354..5b3fc9d123 100644 --- a/block/backup.c +++ b/block/backup.c @@ -384,15 +384,23 @@ static bool bdrv_is_unallocated_range(BlockDriverState *bs, return offset >= end; } -static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +static int coroutine_fn backup_loop(BackupBlockJob *job) { int ret; bool error_is_read; int64_t offset; HBitmapIter hbi; + BlockDriverState *bs = blk_bs(job->common.blk); hbitmap_iter_init(&hbi, job->copy_bitmap, 0); while ((offset = hbitmap_iter_next(&hbi)) != -1) { + if (job->sync_mode == MIRROR_SYNC_MODE_TOP && + bdrv_is_unallocated_range(bs, offset, job->cluster_size)) + { + hbitmap_reset(job->copy_bitmap, offset, job->cluster_size); + continue; + } + do { if (yield_and_check(job)) { return 0; @@ -437,7 +445,6 @@ static int coroutine_fn backup_run(Job *job, Error **errp) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); BlockDriverState *bs = blk_bs(s->common.blk); - int64_t offset; int ret = 0; QLIST_INIT(&s->inflight_reqs); @@ -462,38 +469,8 @@ static int coroutine_fn backup_run(Job *job, Error **errp) * notify callback service CoW requests. */ job_yield(job); } - } else if (s->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { - ret = backup_run_incremental(s); } else { - /* Both FULL and TOP SYNC_MODE's require copying.. */ - for (offset = 0; offset < s->len; - offset += s->cluster_size) { - bool error_is_read; - - if (yield_and_check(s)) { - break; - } - - if (s->sync_mode == MIRROR_SYNC_MODE_TOP && - bdrv_is_unallocated_range(bs, offset, s->cluster_size)) - { - continue; - } - - ret = backup_do_cow(s, offset, s->cluster_size, - &error_is_read, false); - if (ret < 0) { - /* Depending on error action, fail now or retry cluster */ - BlockErrorAction action = - backup_error_action(s, error_is_read, -ret); - if (action == BLOCK_ERROR_ACTION_REPORT) { - break; - } else { - offset -= s->cluster_size; - continue; - } - } - } + ret = backup_loop(s); } notifier_with_return_remove(&s->before_write); From ae6b12fa4cf7d54add35531c790aaf2bd6d833f3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 29 Apr 2019 12:08:42 +0300 Subject: [PATCH 13/21] block/backup: refactor: split out backup_calculate_cluster_size Split out cluster_size calculation. Move copy-bitmap creation above block-job creation, as we are going to share it with upcoming backup-top filter, which also should be created before actual block job creation. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-id: 20190429090842.57910-6-vsementsov@virtuozzo.com [mreitz: Dropped a paragraph from the commit message that was left over from a previous version] Signed-off-by: Max Reitz --- block/backup.c | 82 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/block/backup.c b/block/backup.c index 5b3fc9d123..00f4f8af53 100644 --- a/block/backup.c +++ b/block/backup.c @@ -497,6 +497,42 @@ static const BlockJobDriver backup_job_driver = { .drain = backup_drain, }; +static int64_t backup_calculate_cluster_size(BlockDriverState *target, + Error **errp) +{ + int ret; + BlockDriverInfo bdi; + + /* + * If there is no backing file on the target, we cannot rely on COW if our + * backup cluster size is smaller than the target cluster size. Even for + * targets with a backing file, try to avoid COW if possible. + */ + ret = bdrv_get_info(target, &bdi); + if (ret == -ENOTSUP && !target->backing) { + /* Cluster size is not defined */ + warn_report("The target block device doesn't provide " + "information about the block size and it doesn't have a " + "backing file. The default block size of %u bytes is " + "used. If the actual block size of the target exceeds " + "this default, the backup may be unusable", + BACKUP_CLUSTER_SIZE_DEFAULT); + return BACKUP_CLUSTER_SIZE_DEFAULT; + } else if (ret < 0 && !target->backing) { + error_setg_errno(errp, -ret, + "Couldn't determine the cluster size of the target image, " + "which has no backing file"); + error_append_hint(errp, + "Aborting, since this may create an unusable destination image\n"); + return ret; + } else if (ret < 0 && target->backing) { + /* Not fatal; just trudge on ahead. */ + return BACKUP_CLUSTER_SIZE_DEFAULT; + } + + return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); +} + BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap, @@ -508,9 +544,10 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, JobTxn *txn, Error **errp) { int64_t len; - BlockDriverInfo bdi; BackupBlockJob *job = NULL; int ret; + int64_t cluster_size; + HBitmap *copy_bitmap = NULL; assert(bs); assert(target); @@ -572,6 +609,13 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, goto error; } + cluster_size = backup_calculate_cluster_size(target, errp); + if (cluster_size < 0) { + goto error; + } + + copy_bitmap = hbitmap_alloc(len, ctz32(cluster_size)); + /* job->len is fixed, so we can't allow resize */ job = block_job_create(job_id, &backup_job_driver, txn, bs, BLK_PERM_CONSISTENT_READ, @@ -600,35 +644,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, /* Detect image-fleecing (and similar) schemes */ job->serialize_target_writes = bdrv_chain_contains(target, bs); - - /* If there is no backing file on the target, we cannot rely on COW if our - * backup cluster size is smaller than the target cluster size. Even for - * targets with a backing file, try to avoid COW if possible. */ - ret = bdrv_get_info(target, &bdi); - if (ret == -ENOTSUP && !target->backing) { - /* Cluster size is not defined */ - warn_report("The target block device doesn't provide " - "information about the block size and it doesn't have a " - "backing file. The default block size of %u bytes is " - "used. If the actual block size of the target exceeds " - "this default, the backup may be unusable", - BACKUP_CLUSTER_SIZE_DEFAULT); - job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; - } else if (ret < 0 && !target->backing) { - error_setg_errno(errp, -ret, - "Couldn't determine the cluster size of the target image, " - "which has no backing file"); - error_append_hint(errp, - "Aborting, since this may create an unusable destination image\n"); - goto error; - } else if (ret < 0 && target->backing) { - /* Not fatal; just trudge on ahead. */ - job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; - } else { - job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); - } - - job->copy_bitmap = hbitmap_alloc(len, ctz32(job->cluster_size)); + job->cluster_size = cluster_size; + job->copy_bitmap = copy_bitmap; + copy_bitmap = NULL; job->use_copy_range = true; job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk), blk_get_max_transfer(job->target)); @@ -644,6 +662,10 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, return &job->common; error: + if (copy_bitmap) { + assert(!job || !job->copy_bitmap); + hbitmap_free(copy_bitmap); + } if (sync_bitmap) { bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); } From dd4118c792a8c2a104fe90274e8a41e0db1ebc56 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 13 May 2019 16:46:17 +0300 Subject: [PATCH 14/21] block: Use bdrv_unref_child() for all children in bdrv_close() bdrv_unref_child() does the following things: - Updates the child->bs->inherits_from pointer. - Calls bdrv_detach_child() to remove the BdrvChild from bs->children. - Calls bdrv_unref() to unref the child BlockDriverState. When bdrv_unref_child() was introduced in commit 33a604075c it was not used in bdrv_close() because the drivers that had additional children (like quorum or blkverify) had already called bdrv_unref() on their children during their own close functions. This was changed later (in 0bd6e91a7e for quorum, in 3e586be0b2 for blkverify) so there's no reason not to use bdrv_unref_child() in bdrv_close() anymore. After this there's also no need to remove bs->backing and bs->file separately from the rest of the children, so bdrv_close() can be simplified. Now bdrv_close() unrefs all children (before this patch it was only bs->file and bs->backing). As a result, none of the callers of brvd_attach_child() should remove their reference to child_bs (because this function effectively steals that reference). This patch updates a couple of tests that were doing their own bdrv_unref(). Signed-off-by: Alberto Garcia Message-id: 6d1d5feaa53aa1ab127adb73d605dc4503e3abd5.1557754872.git.berto@igalia.com [mreitz: s/where/were/] Signed-off-by: Max Reitz --- block.c | 16 +++------------- tests/test-bdrv-drain.c | 6 ------ tests/test-bdrv-graph-mod.c | 1 - 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/block.c b/block.c index cb11537029..be37280dc7 100644 --- a/block.c +++ b/block.c @@ -3877,22 +3877,12 @@ static void bdrv_close(BlockDriverState *bs) bs->drv = NULL; } - bdrv_set_backing_hd(bs, NULL, &error_abort); - - if (bs->file != NULL) { - bdrv_unref_child(bs, bs->file); - bs->file = NULL; - } - QLIST_FOREACH_SAFE(child, &bs->children, next, next) { - /* TODO Remove bdrv_unref() from drivers' close function and use - * bdrv_unref_child() here */ - if (child->bs->inherits_from == bs) { - child->bs->inherits_from = NULL; - } - bdrv_detach_child(child); + bdrv_unref_child(bs, child); } + bs->backing = NULL; + bs->file = NULL; g_free(bs->opaque); bs->opaque = NULL; atomic_set(&bs->copy_on_read, 0); diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index eda90750eb..5534c2adf9 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -1436,12 +1436,6 @@ static void test_detach_indirect(bool by_parent_cb) bdrv_unref(parent_b); blk_unref(blk); - /* XXX Once bdrv_close() unref's children instead of just detaching them, - * this won't be necessary any more. */ - bdrv_unref(a); - bdrv_unref(a); - bdrv_unref(c); - g_assert_cmpint(a->refcnt, ==, 1); g_assert_cmpint(b->refcnt, ==, 1); g_assert_cmpint(c->refcnt, ==, 1); diff --git a/tests/test-bdrv-graph-mod.c b/tests/test-bdrv-graph-mod.c index 283dc84869..747c0bf8fc 100644 --- a/tests/test-bdrv-graph-mod.c +++ b/tests/test-bdrv-graph-mod.c @@ -116,7 +116,6 @@ static void test_update_perm_tree(void) g_assert_nonnull(local_err); error_free(local_err); - bdrv_unref(bs); blk_unref(root); } From b441dc71c0b7e8f488a6ebc2aa781b08a3a05038 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 13 May 2019 16:46:18 +0300 Subject: [PATCH 15/21] block: Make bdrv_root_attach_child() unref child_bs on failure A consequence of the previous patch is that bdrv_attach_child() transfers the reference to child_bs from the caller to parent_bs, which will drop it on bdrv_close() or when someone calls bdrv_unref_child(). But this only happens when bdrv_attach_child() succeeds. If it fails then the caller is responsible for dropping the reference to child_bs. This patch makes bdrv_attach_child() take the reference also when there is an error, freeing the caller for having to do it. A similar situation happens with bdrv_root_attach_child(), so the changes on this patch affect both functions. Signed-off-by: Alberto Garcia Message-id: 20dfb3d9ccec559cdd1a9690146abad5d204a186.1557754872.git.berto@igalia.com [mreitz: Removed now superfluous BdrvChild * variable in bdrv_open_child()] Signed-off-by: Max Reitz --- block.c | 30 ++++++++++++++++++------------ block/block-backend.c | 3 +-- block/quorum.c | 1 - blockjob.c | 2 +- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/block.c b/block.c index be37280dc7..1a73e310c1 100644 --- a/block.c +++ b/block.c @@ -2243,6 +2243,13 @@ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs) } } +/* + * This function steals the reference to child_bs from the caller. + * That reference is later dropped by bdrv_root_unref_child(). + * + * On failure NULL is returned, errp is set and the reference to + * child_bs is also dropped. + */ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, const char *child_name, const BdrvChildRole *child_role, @@ -2255,6 +2262,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, ret = bdrv_check_update_perm(child_bs, NULL, perm, shared_perm, NULL, errp); if (ret < 0) { bdrv_abort_perm_update(child_bs); + bdrv_unref(child_bs); return NULL; } @@ -2274,6 +2282,14 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, return child; } +/* + * This function transfers the reference to child_bs from the caller + * to parent_bs. That reference is later dropped by parent_bs on + * bdrv_close() or if someone calls bdrv_unref_child(). + * + * On failure NULL is returned, errp is set and the reference to + * child_bs is also dropped. + */ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, const char *child_name, @@ -2401,12 +2417,9 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, /* If backing_hd was already part of bs's backing chain, and * inherits_from pointed recursively to bs then let's update it to * point directly to bs (else it will become NULL). */ - if (update_inherits_from) { + if (bs->backing && update_inherits_from) { backing_hd->inherits_from = bs; } - if (!bs->backing) { - bdrv_unref(backing_hd); - } out: bdrv_refresh_limits(bs, NULL); @@ -2594,7 +2607,6 @@ BdrvChild *bdrv_open_child(const char *filename, const BdrvChildRole *child_role, bool allow_none, Error **errp) { - BdrvChild *c; BlockDriverState *bs; bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_role, @@ -2603,13 +2615,7 @@ BdrvChild *bdrv_open_child(const char *filename, return NULL; } - c = bdrv_attach_child(parent, bs, bdref_key, child_role, errp); - if (!c) { - bdrv_unref(bs); - return NULL; - } - - return c; + return bdrv_attach_child(parent, bs, bdref_key, child_role, errp); } /* TODO Future callers may need to specify parent/child_role in order for diff --git a/block/block-backend.c b/block/block-backend.c index 4c0a8ef88d..ad3e1c882d 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -392,7 +392,6 @@ BlockBackend *blk_new_open(const char *filename, const char *reference, blk->root = bdrv_root_attach_child(bs, "root", &child_root, perm, BLK_PERM_ALL, blk, errp); if (!blk->root) { - bdrv_unref(bs); blk_unref(blk); return NULL; } @@ -800,12 +799,12 @@ void blk_remove_bs(BlockBackend *blk) int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) { ThrottleGroupMember *tgm = &blk->public.throttle_group_member; + bdrv_ref(bs); blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->perm, blk->shared_perm, blk, errp); if (blk->root == NULL) { return -EPERM; } - bdrv_ref(bs); notifier_list_notify(&blk->insert_bs_notifiers, blk); if (tgm->throttle_state) { diff --git a/block/quorum.c b/block/quorum.c index 352f729136..133ee18204 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1019,7 +1019,6 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, child = bdrv_attach_child(bs, child_bs, indexstr, &child_format, errp); if (child == NULL) { s->next_child_index--; - bdrv_unref(child_bs); goto out; } s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); diff --git a/blockjob.c b/blockjob.c index 9ca942ba01..cc5f18e7cd 100644 --- a/blockjob.c +++ b/blockjob.c @@ -204,6 +204,7 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, { BdrvChild *c; + bdrv_ref(bs); c = bdrv_root_attach_child(bs, name, &child_job, perm, shared_perm, job, errp); if (c == NULL) { @@ -211,7 +212,6 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, } job->nodes = g_slist_prepend(job->nodes, c); - bdrv_ref(bs); bdrv_op_block_all(bs, job->blocker); return 0; From 4ebe0617269113ec52a74d3dbb574a62ef6a0c83 Mon Sep 17 00:00:00 2001 From: Sam Eiderman Date: Thu, 23 May 2019 19:33:35 +0300 Subject: [PATCH 16/21] qemu-img: rebase: Reuse parent BlockDriverState In safe mode we open the entire chain, including the parent backing file of the rebased file. Do not open a new BlockBackend for the parent backing file, which saves opening the rest of the chain twice, which for long chains saves many "pricy" bdrv_open() calls. Permissions for blk_new() were copied from blk_new_open() when flags = 0. Reviewed-by: Karl Heubaum Reviewed-by: Eyal Moscovici Signed-off-by: Sagi Amit Co-developed-by: Sagi Amit Signed-off-by: Sam Eiderman Message-id: 20190523163337.4497-2-shmuel.eiderman@oracle.com Signed-off-by: Max Reitz --- qemu-img.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 28fba1e7a7..9bd0bb1e47 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3309,29 +3309,18 @@ static int img_rebase(int argc, char **argv) /* For safe rebasing we need to compare old and new backing file */ if (!unsafe) { - char backing_name[PATH_MAX]; QDict *options = NULL; + BlockDriverState *base_bs = backing_bs(bs); - if (bs->backing) { - if (bs->backing_format[0] != '\0') { - options = qdict_new(); - qdict_put_str(options, "driver", bs->backing_format); - } - - if (force_share) { - if (!options) { - options = qdict_new(); - } - qdict_put_bool(options, BDRV_OPT_FORCE_SHARE, true); - } - bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name)); - blk_old_backing = blk_new_open(backing_name, NULL, - options, src_flags, &local_err); - if (!blk_old_backing) { + if (base_bs) { + blk_old_backing = blk_new(BLK_PERM_CONSISTENT_READ, + BLK_PERM_ALL); + ret = blk_insert_bs(blk_old_backing, base_bs, + &local_err); + if (ret < 0) { error_reportf_err(local_err, - "Could not open old backing file '%s': ", - backing_name); - ret = -1; + "Could not reuse old backing file '%s': ", + base_bs->filename); goto out; } } else { From 863cc78f1b38e926a1ac71348aaba0f6777660ff Mon Sep 17 00:00:00 2001 From: Sam Eiderman Date: Thu, 23 May 2019 19:33:36 +0300 Subject: [PATCH 17/21] qemu-img: rebase: Reduce reads on in-chain rebase In the following case: (base) A <- B <- C (tip) when running: qemu-img rebase -b A C QEMU would read all sectors not allocated in the file being rebased (C) and compare them to the new base image (A), regardless of whether they were changed or even allocated anywhere along the chain between the new base and the top image (B). This causes many unneeded reads when rebasing an image which represents a small diff of a large disk, as it would read most of the disk's sectors. Instead, use bdrv_is_allocated_above() to reduce the number of unnecessary reads. Reviewed-by: Karl Heubaum Signed-off-by: Sam Eiderman Signed-off-by: Eyal Moscovici Message-id: 20190523163337.4497-3-shmuel.eiderman@oracle.com Signed-off-by: Max Reitz --- qemu-img.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index 9bd0bb1e47..2d96a491e2 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3164,7 +3164,7 @@ static int img_rebase(int argc, char **argv) BlockBackend *blk = NULL, *blk_old_backing = NULL, *blk_new_backing = NULL; uint8_t *buf_old = NULL; uint8_t *buf_new = NULL; - BlockDriverState *bs = NULL; + BlockDriverState *bs = NULL, *prefix_chain_bs = NULL; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; int c, flags, src_flags, ret; @@ -3353,6 +3353,12 @@ static int img_rebase(int argc, char **argv) goto out; } + /* + * Find out whether we rebase an image on top of a previous image + * in its chain. + */ + prefix_chain_bs = bdrv_find_backing_image(bs, out_real_path); + blk_new_backing = blk_new_open(out_real_path, NULL, options, src_flags, &local_err); g_free(out_real_path); @@ -3437,6 +3443,23 @@ static int img_rebase(int argc, char **argv) continue; } + if (prefix_chain_bs) { + /* + * If cluster wasn't changed since prefix_chain, we don't need + * to take action + */ + ret = bdrv_is_allocated_above(backing_bs(bs), prefix_chain_bs, + offset, n, &n); + if (ret < 0) { + error_report("error while reading image metadata: %s", + strerror(-ret)); + goto out; + } + if (!ret) { + continue; + } + } + /* * Read old and new backing file and take into consideration that * backing files may be smaller than the COW image. From 330c72957196e0ae382abcaa97ebf4eb9bc8574f Mon Sep 17 00:00:00 2001 From: Sam Eiderman Date: Thu, 23 May 2019 19:33:37 +0300 Subject: [PATCH 18/21] qemu-img: rebase: Reuse in-chain BlockDriverState If a chain was detected, don't open a new BlockBackend from the target backing file which will create a new BlockDriverState. Instead, create an empty BlockBackend and attach the already open BlockDriverState. Permissions for blk_new() were copied from blk_new_open() when flags = 0. Reviewed-by: Karl Heubaum Reviewed-by: Eyal Moscovici Signed-off-by: Sagi Amit Co-developed-by: Sagi Amit Signed-off-by: Sam Eiderman Message-id: 20190523163337.4497-4-shmuel.eiderman@oracle.com Signed-off-by: Max Reitz --- qemu-img.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 2d96a491e2..b0535919b1 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3358,16 +3358,29 @@ static int img_rebase(int argc, char **argv) * in its chain. */ prefix_chain_bs = bdrv_find_backing_image(bs, out_real_path); - - blk_new_backing = blk_new_open(out_real_path, NULL, - options, src_flags, &local_err); - g_free(out_real_path); - if (!blk_new_backing) { - error_reportf_err(local_err, - "Could not open new backing file '%s': ", - out_baseimg); - ret = -1; - goto out; + if (prefix_chain_bs) { + g_free(out_real_path); + blk_new_backing = blk_new(BLK_PERM_CONSISTENT_READ, + BLK_PERM_ALL); + ret = blk_insert_bs(blk_new_backing, prefix_chain_bs, + &local_err); + if (ret < 0) { + error_reportf_err(local_err, + "Could not reuse backing file '%s': ", + out_baseimg); + goto out; + } + } else { + blk_new_backing = blk_new_open(out_real_path, NULL, + options, src_flags, &local_err); + g_free(out_real_path); + if (!blk_new_backing) { + error_reportf_err(local_err, + "Could not open new backing file '%s': ", + out_baseimg); + ret = -1; + goto out; + } } } } From c8bb23cbdbe32f5c326365e0a82e1b0e68cdcd8a Mon Sep 17 00:00:00 2001 From: Anton Nefedov Date: Thu, 16 May 2019 17:27:49 +0300 Subject: [PATCH 19/21] qcow2: skip writing zero buffers to empty COW areas If COW areas of the newly allocated clusters are zeroes on the backing image, efficient bdrv_write_zeroes(flags=BDRV_REQ_NO_FALLBACK) can be used on the whole cluster instead of writing explicit zero buffers later in perform_cow(). iotest 060: write to the discarded cluster does not trigger COW anymore. Use a backing image instead. Signed-off-by: Anton Nefedov Message-id: 20190516142749.81019-2-anton.nefedov@virtuozzo.com Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Max Reitz --- block/qcow2-cluster.c | 2 +- block/qcow2.c | 85 ++++++++++++++++++++++++++++++++++++++ block/qcow2.h | 6 +++ block/trace-events | 1 + qapi/block-core.json | 4 +- tests/qemu-iotests/060 | 7 +++- tests/qemu-iotests/060.out | 5 ++- 7 files changed, 106 insertions(+), 4 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4a929900cf..cf892f37a8 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -831,7 +831,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) assert(start->offset + start->nb_bytes <= end->offset); assert(!m->data_qiov || m->data_qiov->size == data_bytes); - if (start->nb_bytes == 0 && end->nb_bytes == 0) { + if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) { return 0; } diff --git a/block/qcow2.c b/block/qcow2.c index dea765b2f4..f2cb131048 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2103,6 +2103,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes, continue; } + /* If COW regions are handled already, skip this too */ + if (m->skip_cow) { + continue; + } + /* The data (middle) region must be immediately after the * start region */ if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) { @@ -2128,6 +2133,80 @@ static bool merge_cow(uint64_t offset, unsigned bytes, return false; } +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + int64_t nr; + return !bytes || + (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes); +} + +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m) +{ + /* + * This check is designed for optimization shortcut so it must be + * efficient. + * Instead of is_zero(), use is_unallocated() as it is faster (but not + * as accurate and can result in false negatives). + */ + return is_unallocated(bs, m->offset + m->cow_start.offset, + m->cow_start.nb_bytes) && + is_unallocated(bs, m->offset + m->cow_end.offset, + m->cow_end.nb_bytes); +} + +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) +{ + BDRVQcow2State *s = bs->opaque; + QCowL2Meta *m; + + if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) { + return 0; + } + + if (bs->encrypted) { + return 0; + } + + for (m = l2meta; m != NULL; m = m->next) { + int ret; + + if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) { + continue; + } + + if (!is_zero_cow(bs, m)) { + continue; + } + + /* + * instead of writing zero COW buffers, + * efficiently zero out the whole clusters + */ + + ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset, + m->nb_clusters * s->cluster_size, + true); + if (ret < 0) { + return ret; + } + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE); + ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset, + m->nb_clusters * s->cluster_size, + BDRV_REQ_NO_FALLBACK); + if (ret < 0) { + if (ret != -ENOTSUP && ret != -EAGAIN) { + return ret; + } + continue; + } + + trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters); + m->skip_cow = true; + } + return 0; +} + static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -2207,6 +2286,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } + /* Try to efficiently initialize the physical space with zeroes */ + ret = handle_alloc_space(bs, l2meta); + if (ret < 0) { + goto out_unlocked; + } + /* If we need to do COW, check if it's possible to merge the * writing of the guest data together with that of the COW regions. * If it's not possible (or not necessary) then write the diff --git a/block/qcow2.h b/block/qcow2.h index e873ae5d12..567375e56c 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -405,6 +405,12 @@ typedef struct QCowL2Meta */ Qcow2COWRegion cow_end; + /* + * Indicates that COW regions are already handled and do not require + * any more processing. + */ + bool skip_cow; + /** * The I/O vector with the data from the actual guest write request. * If non-NULL, this is meant to be merged together with the data diff --git a/block/trace-events b/block/trace-events index 79ccd8d824..1e0653ce6d 100644 --- a/block/trace-events +++ b/block/trace-events @@ -68,6 +68,7 @@ qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d" qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64 qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d" qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d" +qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d" # qcow2-cluster.c qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" diff --git a/qapi/block-core.json b/qapi/block-core.json index 7ccbfff9d0..3e4042be7f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3215,6 +3215,8 @@ # # @cor_write: a write due to copy-on-read (since 2.11) # +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.1) +# # Since: 2.9 ## { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG', @@ -3233,7 +3235,7 @@ 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev', 'pwritev_zero', 'pwritev_done', 'empty_image_prepare', 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters', - 'cor_write'] } + 'cor_write', 'cluster_alloc_space'] } ## # @BlkdebugInjectErrorOptions: diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060 index 89e911400c..b91d8321bb 100755 --- a/tests/qemu-iotests/060 +++ b/tests/qemu-iotests/060 @@ -150,10 +150,15 @@ $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io echo echo "=== Testing overlap while COW is in flight ===" echo +BACKING_IMG=$TEST_IMG.base +TEST_IMG=$BACKING_IMG _make_test_img 1G + +$QEMU_IO -c 'write 0k 64k' "$BACKING_IMG" | _filter_qemu_io + # compat=0.10 is required in order to make the following discard actually # unallocate the sector rather than make it a zero sector - we want COW, after # all. -IMGOPTS='compat=0.10' _make_test_img 1G +IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G # Write two clusters, the second one enforces creation of an L2 table after # the first data cluster. $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out index e42bf8c5a9..0f6b0658a1 100644 --- a/tests/qemu-iotests/060.out +++ b/tests/qemu-iotests/060.out @@ -97,7 +97,10 @@ read 512/512 bytes at offset 0 === Testing overlap while COW is in flight === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 backing_file=TEST_DIR/t.IMGFMT.base wrote 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 65536/65536 bytes at offset 536870912 From 6388903e7cc9b6528934a083e81f086322c24c86 Mon Sep 17 00:00:00 2001 From: Andrey Shinkevich Date: Mon, 27 May 2019 15:52:01 +0300 Subject: [PATCH 20/21] qcow2-bitmap: initialize bitmap directory alignment Valgrind detects multiple issues in QEMU iotests when the memory is used without being initialized. Valgrind may dump lots of unnecessary reports what makes the memory issue analysis harder. Particularly, that is true for the aligned bitmap directory and can be seen while running the iotest #169. Padding the aligned space with zeros eases the pain. Signed-off-by: Andrey Shinkevich Message-id: 1558961521-131620-1-git-send-email-andrey.shinkevich@virtuozzo.com Signed-off-by: Max Reitz --- block/qcow2-bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index 640da68ce1..b2487101ed 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -753,7 +753,7 @@ static int bitmap_list_store(BlockDriverState *bs, Qcow2BitmapList *bm_list, dir_offset = *offset; } - dir = g_try_malloc(dir_size); + dir = g_try_malloc0(dir_size); if (dir == NULL) { return -ENOMEM; } From a2d665c1bc3624a8375e2f9a7d569f7565cc1358 Mon Sep 17 00:00:00 2001 From: John Snow Date: Tue, 21 May 2019 17:00:53 -0400 Subject: [PATCH 21/21] blockdev: loosen restrictions on drive-backup source node We mandate that the source node must be a root node; but there's no reason I am aware of that it needs to be restricted to such. In some cases, we need to make sure that there's a medium present, but in the general case we can allow the backup job itself to do the graph checking. This patch helps improve the error message when you try to backup from the same node more than once, which is reflected in the change to test 056. For backups with bitmaps, it will also show a better error message that the bitmap is in use instead of giving you something cryptic like "need a root node." Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1707303 Signed-off-by: John Snow Message-id: 20190521210053.8864-1-jsnow@redhat.com Signed-off-by: Max Reitz --- blockdev.c | 7 ++++++- tests/qemu-iotests/056 | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 79fbac8450..bc339d7818 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3450,11 +3450,16 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, backup->compress = false; } - bs = qmp_get_root_bs(backup->device, errp); + bs = bdrv_lookup_bs(backup->device, backup->device, errp); if (!bs) { return NULL; } + if (!bs->drv) { + error_setg(errp, "Device has no medium"); + return NULL; + } + aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); diff --git a/tests/qemu-iotests/056 b/tests/qemu-iotests/056 index 3df323984d..f40fc11a09 100755 --- a/tests/qemu-iotests/056 +++ b/tests/qemu-iotests/056 @@ -214,7 +214,7 @@ class BackupTest(iotests.QMPTestCase): res = self.vm.qmp('query-block-jobs') self.assert_qmp(res, 'return[0]/status', 'concluded') # Leave zombie job un-dismissed, observe a failure: - res = self.qmp_backup_and_wait(serror='Need a root block node', + res = self.qmp_backup_and_wait(serror="Node 'drive0' is busy: block device is in use by block job: backup", device='drive0', format=iotests.imgfmt, sync='full', target=self.dest_img, auto_dismiss=False)