diff --git a/block/Makefile.objs b/block/Makefile.objs index d644bac60a..899bfb5e2c 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -26,7 +26,7 @@ block-obj-y += accounting.o dirty-bitmap.o block-obj-y += write-threshold.o block-obj-y += backup.o block-obj-$(CONFIG_REPLICATION) += replication.o -block-obj-y += throttle.o +block-obj-y += throttle.o copy-on-read.o block-obj-y += crypto.o diff --git a/block/backup.c b/block/backup.c index 453cd62c24..e14d99560d 100644 --- a/block/backup.c +++ b/block/backup.c @@ -27,7 +27,6 @@ #include "qemu/error-report.h" #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) -#define SLICE_TIME 100000000ULL /* ns */ typedef struct BackupBlockJob { BlockJob common; @@ -35,10 +34,10 @@ typedef struct BackupBlockJob { /* bitmap for sync=incremental */ BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; - RateLimit limit; BlockdevOnError on_source_error; BlockdevOnError on_target_error; CoRwlock flush_rwlock; + uint64_t len; uint64_t bytes_read; int64_t cluster_size; bool compress; @@ -48,6 +47,8 @@ typedef struct BackupBlockJob { HBitmap *copy_bitmap; } BackupBlockJob; +static const BlockJobDriver backup_job_driver; + /* See if in-flight requests overlap and wait for them to complete */ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, int64_t start, @@ -118,7 +119,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job, trace_backup_do_cow_process(job, start); - n = MIN(job->cluster_size, job->common.len - start); + n = MIN(job->cluster_size, job->len - start); if (!bounce_buffer) { bounce_buffer = blk_blockalign(blk, job->cluster_size); @@ -159,7 +160,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job, * offset field is an opaque progress value, it is not a disk offset. 
*/ job->bytes_read += n; - job->common.offset += n; + block_job_progress_update(&job->common, n); } out: @@ -190,17 +191,6 @@ static int coroutine_fn backup_before_write_notify( return backup_do_cow(job, req->offset, req->bytes, NULL, true); } -static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed, SLICE_TIME); -} - static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) { BdrvDirtyBitmap *bm; @@ -253,7 +243,7 @@ void backup_do_checkpoint(BlockJob *job, Error **errp) BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); int64_t len; - assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP); + assert(block_job_driver(job) == &backup_job_driver); if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) { error_setg(errp, "The backup job only supports block checkpoint in" @@ -261,7 +251,7 @@ void backup_do_checkpoint(BlockJob *job, Error **errp) return; } - len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size); + len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size); hbitmap_set(backup_job->copy_bitmap, 0, len); } @@ -271,7 +261,7 @@ void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset, BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); int64_t start, end; - assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP); + assert(block_job_driver(job) == &backup_job_driver); start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size); end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size); @@ -284,7 +274,7 @@ void backup_cow_request_begin(CowRequest *req, BlockJob *job, BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); int64_t start, end; - assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP); + assert(block_job_driver(job) == &backup_job_driver); 
start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size); end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size); @@ -337,21 +327,17 @@ static void backup_complete(BlockJob *job, void *opaque) static bool coroutine_fn yield_and_check(BackupBlockJob *job) { + uint64_t delay_ns; + if (block_job_is_cancelled(&job->common)) { return true; } - /* we need to yield so that bdrv_drain_all() returns. - * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, - job->bytes_read); - job->bytes_read = 0; - block_job_sleep_ns(&job->common, delay_ns); - } else { - block_job_sleep_ns(&job->common, 0); - } + /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can + * return. Without a yield, the VM would not reboot. */ + delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read); + job->bytes_read = 0; + block_job_sleep_ns(&job->common, delay_ns); if (block_job_is_cancelled(&job->common)) { return true; @@ -420,8 +406,9 @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job) bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size); } - job->common.offset = job->common.len - - hbitmap_count(job->copy_bitmap) * job->cluster_size; + /* TODO block_job_progress_set_remaining() would make more sense */ + block_job_progress_update(&job->common, + job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size); bdrv_dirty_iter_free(dbi); } @@ -437,7 +424,9 @@ static void coroutine_fn backup_run(void *opaque) QLIST_INIT(&job->inflight_reqs); qemu_co_rwlock_init(&job->flush_rwlock); - nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size); + nb_clusters = DIV_ROUND_UP(job->len, job->cluster_size); + block_job_progress_set_remaining(&job->common, job->len); + job->copy_bitmap = hbitmap_alloc(nb_clusters, 0); if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { backup_incremental_init_copy_bitmap(job); @@ -461,7 +450,7 @@ static void coroutine_fn backup_run(void 
*opaque) ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ - for (offset = 0; offset < job->common.len; + for (offset = 0; offset < job->len; offset += job->cluster_size) { bool error_is_read; int alloced = 0; @@ -537,7 +526,6 @@ static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), .job_type = BLOCK_JOB_TYPE_BACKUP, .start = backup_run, - .set_speed = backup_set_speed, .commit = backup_commit, .abort = backup_abort, .clean = backup_clean, @@ -620,7 +608,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, goto error; } - /* job->common.len is fixed, so we can't allow resize */ + /* job->len is fixed, so we can't allow resize */ job = block_job_create(job_id, &backup_job_driver, txn, bs, BLK_PERM_CONSISTENT_READ, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | @@ -676,7 +664,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, /* Required permissions are already taken with target's blk_new() */ block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL, &error_abort); - job->common.len = len; + job->len = len; return &job->common; diff --git a/block/blkdebug.c b/block/blkdebug.c index 053372c22e..526af2a808 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -398,10 +398,11 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto out; } - bs->supported_write_flags = BDRV_REQ_FUA & - bs->file->bs->supported_write_flags; - bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & - bs->file->bs->supported_zero_flags; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); ret = -EINVAL; /* Set alignment overrides */ diff --git a/block/blkreplay.c b/block/blkreplay.c index fe5a9b4a98..b016dbeee7 100755 --- 
a/block/blkreplay.c +++ b/block/blkreplay.c @@ -35,6 +35,9 @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; + ret = 0; fail: return ret; diff --git a/block/blkverify.c b/block/blkverify.c index 754cc9e857..da97ee5927 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -141,6 +141,9 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; + ret = 0; fail: qemu_opts_del(opts); diff --git a/block/block-backend.c b/block/block-backend.c index 681b240b12..89f47b00ea 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1865,13 +1865,7 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason) AioContext *blk_get_aio_context(BlockBackend *blk) { - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bdrv_get_aio_context(bs); - } else { - return qemu_get_aio_context(); - } + return bdrv_get_aio_context(blk_bs(blk)); } static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) diff --git a/block/commit.c b/block/commit.c index 1432baeef4..ba5df6aa0a 100644 --- a/block/commit.c +++ b/block/commit.c @@ -31,11 +31,8 @@ enum { COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */ }; -#define SLICE_TIME 100000000ULL /* ns */ - typedef struct CommitBlockJob { BlockJob common; - RateLimit limit; BlockDriverState *commit_top_bs; BlockBackend *top; BlockBackend *base; @@ -146,21 +143,21 @@ static void coroutine_fn commit_run(void *opaque) int64_t n = 0; /* bytes */ void *buf = NULL; int bytes_written = 0; - int64_t base_len; + int64_t len, base_len; - ret = s->common.len = blk_getlength(s->top); - - if (s->common.len < 0) { + ret = len = blk_getlength(s->top); + if (len < 0) { goto out; } + block_job_progress_set_remaining(&s->common, len); ret = base_len = 
blk_getlength(s->base); if (base_len < 0) { goto out; } - if (base_len < s->common.len) { - ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL); + if (base_len < len) { + ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL); if (ret) { goto out; } @@ -168,7 +165,7 @@ static void coroutine_fn commit_run(void *opaque) buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); - for (offset = 0; offset < s->common.len; offset += n) { + for (offset = 0; offset < len; offset += n) { bool copy; /* Note that even when no rate limit is applied we need to yield @@ -198,10 +195,10 @@ static void coroutine_fn commit_run(void *opaque) } } /* Publish progress */ - s->common.offset += n; + block_job_progress_update(&s->common, n); - if (copy && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); + if (copy) { + delay_ns = block_job_ratelimit_get_delay(&s->common, n); } else { delay_ns = 0; } @@ -217,21 +214,9 @@ out: block_job_defer_to_main_loop(&s->common, commit_complete, data); } -static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - CommitBlockJob *s = container_of(job, CommitBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed, SLICE_TIME); -} - static const BlockJobDriver commit_job_driver = { .instance_size = sizeof(CommitBlockJob), .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = commit_set_speed, .start = commit_run, }; diff --git a/block/copy-on-read.c b/block/copy-on-read.c new file mode 100644 index 0000000000..6a97208888 --- /dev/null +++ b/block/copy-on-read.c @@ -0,0 +1,173 @@ +/* + * Copy-on-read filter block driver + * + * Copyright (c) 2018 Red Hat, Inc. 
+ * + * Author: + * Max Reitz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 or + * (at your option) version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "block/block_int.h" + + +static int cor_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, + errp); + if (!bs->file) { + return -EINVAL; + } + + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & + bs->file->bs->supported_write_flags); + + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); + + return 0; +} + + +static void cor_close(BlockDriverState *bs) +{ +} + + +#define PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \ + | BLK_PERM_WRITE \ + | BLK_PERM_RESIZE) +#define PERM_UNCHANGED (BLK_PERM_ALL & ~PERM_PASSTHROUGH) + +static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, + const BdrvChildRole *role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) +{ + if (c == NULL) { + *nperm = (perm & PERM_PASSTHROUGH) | BLK_PERM_WRITE_UNCHANGED; + *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; + return; + } + + *nperm = (perm & PERM_PASSTHROUGH) | + (c->perm & PERM_UNCHANGED); + *nshared = (shared & PERM_PASSTHROUGH) | + (c->shared_perm & PERM_UNCHANGED); +} + + +static int64_t cor_getlength(BlockDriverState *bs) +{ + 
return bdrv_getlength(bs->file->bs); +} + + +static int cor_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) +{ + return bdrv_truncate(bs->file, offset, prealloc, errp); +} + + +static int coroutine_fn cor_co_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + return bdrv_co_preadv(bs->file, offset, bytes, qiov, + flags | BDRV_REQ_COPY_ON_READ); +} + + +static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +} + + +static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int bytes, + BdrvRequestFlags flags) +{ + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + + +static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, + int64_t offset, int bytes) +{ + return bdrv_co_pdiscard(bs->file->bs, offset, bytes); +} + + +static void cor_eject(BlockDriverState *bs, bool eject_flag) +{ + bdrv_eject(bs->file->bs, eject_flag); +} + + +static void cor_lock_medium(BlockDriverState *bs, bool locked) +{ + bdrv_lock_medium(bs->file->bs, locked); +} + + +static bool cor_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); +} + + +BlockDriver bdrv_copy_on_read = { + .format_name = "copy-on-read", + + .bdrv_open = cor_open, + .bdrv_close = cor_close, + .bdrv_child_perm = cor_child_perm, + + .bdrv_getlength = cor_getlength, + .bdrv_truncate = cor_truncate, + + .bdrv_co_preadv = cor_co_preadv, + .bdrv_co_pwritev = cor_co_pwritev, + .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, + .bdrv_co_pdiscard = cor_co_pdiscard, + + .bdrv_eject = cor_eject, + .bdrv_lock_medium = cor_lock_medium, + + .bdrv_co_block_status = bdrv_co_block_status_from_file, + + .bdrv_recurse_is_first_non_filter = cor_recurse_is_first_non_filter, 
+ + .has_variable_length = true, + .is_filter = true, +}; + +static void bdrv_copy_on_read_init(void) +{ + bdrv_register(&bdrv_copy_on_read); +} + +block_init(bdrv_copy_on_read_init); diff --git a/block/file-win32.c b/block/file-win32.c index 2e2f746bb1..3c67db4336 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -251,7 +251,11 @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp) &dg.Geometry.BytesPerSector, &freeClusters, &totalClusters); bs->bl.request_alignment = dg.Geometry.BytesPerSector; + return; } + + /* XXX Does Windows support AIO on less than 512-byte alignment? */ + bs->bl.request_alignment = 512; } static void raw_parse_flags(int flags, bool use_aio, int *access_flags, @@ -410,32 +414,32 @@ fail: return ret; } -static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; if (s->aio) { - return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_READ); + return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov, + cb, opaque, QEMU_AIO_READ); } else { - return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, - nb_sectors << BDRV_SECTOR_BITS, + return paio_submit(bs, s->hfile, offset, qiov, bytes, cb, opaque, QEMU_AIO_READ); } } -static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static BlockAIOCB *raw_aio_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; if (s->aio) { - return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, 
opaque, QEMU_AIO_WRITE); + return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov, + cb, opaque, QEMU_AIO_WRITE); } else { - return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, - nb_sectors << BDRV_SECTOR_BITS, + return paio_submit(bs, s->hfile, offset, qiov, bytes, cb, opaque, QEMU_AIO_WRITE); } } @@ -632,8 +636,8 @@ BlockDriver bdrv_file = { .bdrv_co_create_opts = raw_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_preadv = raw_aio_preadv, + .bdrv_aio_pwritev = raw_aio_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_truncate = raw_truncate, @@ -708,6 +712,12 @@ static void hdev_parse_filename(const char *filename, QDict *options, bdrv_parse_filename_strip_prefix(filename, "host_device:", options); } +static void hdev_refresh_limits(BlockDriverState *bs, Error **errp) +{ + /* XXX Does Windows support AIO on less than 512-byte alignment? */ + bs->bl.request_alignment = 512; +} + static int hdev_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -793,9 +803,10 @@ static BlockDriver bdrv_host_device = { .bdrv_probe_device = hdev_probe_device, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, + .bdrv_refresh_limits = hdev_refresh_limits, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_preadv = raw_aio_preadv, + .bdrv_aio_pwritev = raw_aio_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_detach_aio_context = raw_detach_aio_context, diff --git a/block/gluster.c b/block/gluster.c index 55be566f6d..9900b6420c 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1194,8 +1194,10 @@ static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) + QEMUIOVector *qiov, + int flags) { + assert(!flags); return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 
1); } diff --git a/block/io.c b/block/io.c index bd9a19a9c4..ca96b487eb 100644 --- a/block/io.c +++ b/block/io.c @@ -92,7 +92,8 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) } /* Default alignment based on whether driver has byte interface */ - bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512; + bs->bl.request_alignment = (drv->bdrv_co_preadv || + drv->bdrv_aio_preadv) ? 1 : 512; /* Take some limits from the children as a default */ if (bs->file) { @@ -924,23 +925,14 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); } - sector_num = offset >> BDRV_SECTOR_BITS; - nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); - - if (drv->bdrv_co_readv) { - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { + if (drv->bdrv_aio_preadv) { BlockAIOCB *acb; CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self(), }; - acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, - bdrv_co_io_em_complete, &co); + acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, + bdrv_co_io_em_complete, &co); if (acb == NULL) { return -EIO; } else { @@ -948,6 +940,16 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, return co.ret; } } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + assert(drv->bdrv_co_readv); + + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, @@ -972,6 +974,25 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, goto emulate_flags; } + if (drv->bdrv_aio_pwritev) { + 
BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags, + bdrv_co_io_em_complete, &co); + flags &= ~bs->supported_write_flags; + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + goto emulate_flags; + } + sector_num = offset >> BDRV_SECTOR_BITS; nb_sectors = bytes >> BDRV_SECTOR_BITS; @@ -979,28 +1000,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); - if (drv->bdrv_co_writev_flags) { - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; - } else if (drv->bdrv_co_writev) { - assert(!bs->supported_write_flags); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } + assert(drv->bdrv_co_writev); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; emulate_flags: if (ret == 0 && (flags & BDRV_REQ_FUA)) { @@ -1115,13 +1118,15 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, /* FIXME: Should we (perhaps conditionally) be setting * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy * that still correctly reads as zero? 
*/ - ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0); + ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, + BDRV_REQ_WRITE_UNCHANGED); } else { /* This does not change the data on the disk, it is not * necessary to flush even in cache=writethrough mode. */ ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, - &local_qiov, 0); + &local_qiov, + BDRV_REQ_WRITE_UNCHANGED); } if (ret < 0) { @@ -1501,7 +1506,11 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, assert(!waited || !req->serialising); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - assert(child->perm & BLK_PERM_WRITE); + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); diff --git a/block/iscsi.c b/block/iscsi.c index d19ae0e398..3fd7203916 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -556,8 +556,8 @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, } static int coroutine_fn -iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov, int flags) +iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *iov, int flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -2220,7 +2220,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_pdiscard = iscsi_co_pdiscard, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, - .bdrv_co_writev_flags = iscsi_co_writev_flags, + .bdrv_co_writev = iscsi_co_writev, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ @@ -2255,7 +2255,7 @@ static BlockDriver bdrv_iser = { .bdrv_co_pdiscard = iscsi_co_pdiscard, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, - 
.bdrv_co_writev_flags = iscsi_co_writev_flags, + .bdrv_co_writev = iscsi_co_writev, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ diff --git a/block/mirror.c b/block/mirror.c index 99da9c0858..a4197bb975 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -22,7 +22,6 @@ #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -#define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 #define MAX_IO_BYTES (1 << 20) /* 1 Mb */ #define DEFAULT_MIRROR_BUF_SIZE (MAX_IN_FLIGHT * MAX_IO_BYTES) @@ -36,7 +35,6 @@ typedef struct MirrorBuffer { typedef struct MirrorBlockJob { BlockJob common; - RateLimit limit; BlockBackend *target; BlockDriverState *mirror_top_bs; BlockDriverState *source; @@ -121,7 +119,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } if (!s->initial_zeroing_ongoing) { - s->common.offset += op->bytes; + block_job_progress_update(&s->common, op->bytes); } } qemu_iovec_destroy(&op->qiov); @@ -449,9 +447,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) assert(io_bytes); offset += io_bytes; nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity); - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, io_bytes_acct); - } + delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct); } return delay_ns; } @@ -596,7 +592,7 @@ static void mirror_throttle(MirrorBlockJob *s) { int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - if (now - s->last_pause_ns > SLICE_TIME) { + if (now - s->last_pause_ns > BLOCK_JOB_SLICE_TIME) { s->last_pause_ns = now; block_job_sleep_ns(&s->common, 0); } else { @@ -792,19 +788,17 @@ static void coroutine_fn mirror_run(void *opaque) block_job_pause_point(&s->common); cnt = bdrv_get_dirty_count(s->dirty_bitmap); - /* s->common.offset contains the number of bytes already processed so - * far, cnt is the number of dirty bytes remaining and - * s->bytes_in_flight is the number of bytes currently being - * processed; 
together those are the current total operation length */ - s->common.len = s->common.offset + s->bytes_in_flight + cnt; + /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is + * the number of bytes currently being processed; together those are + * the current remaining operation length */ + block_job_progress_set_remaining(&s->common, s->bytes_in_flight + cnt); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that bdrv_drain_all() returns. - * We do so every SLICE_TIME nanoseconds, or when there is an error, - * or when the source is clean, whichever comes first. - */ + * We do so every BLOCK_JOB_SLICE_TIME nanoseconds, or when there is + * an error, or when the source is clean, whichever comes first. */ delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns; - if (delta < SLICE_TIME && + if (delta < BLOCK_JOB_SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { @@ -870,7 +864,8 @@ static void coroutine_fn mirror_run(void *opaque) ret = 0; if (s->synced && !should_complete) { - delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); + delay_ns = (s->in_flight == 0 && + cnt == 0 ? 
BLOCK_JOB_SLICE_TIME : 0); } trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); block_job_sleep_ns(&s->common, delay_ns); @@ -909,17 +904,6 @@ immediate_exit: block_job_defer_to_main_loop(&s->common, mirror_exit, data); } -static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed, SLICE_TIME); -} - static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); @@ -1004,7 +988,6 @@ static void mirror_drain(BlockJob *job) static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), .job_type = BLOCK_JOB_TYPE_MIRROR, - .set_speed = mirror_set_speed, .start = mirror_run, .complete = mirror_complete, .pause = mirror_pause, @@ -1015,7 +998,6 @@ static const BlockJobDriver mirror_job_driver = { static const BlockJobDriver commit_active_job_driver = { .instance_size = sizeof(MirrorBlockJob), .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = mirror_set_speed, .start = mirror_run, .complete = mirror_complete, .pause = mirror_pause, @@ -1152,6 +1134,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, mirror_top_bs->implicit = true; } mirror_top_bs->total_sectors = bs->total_sectors; + mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs)); /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep diff --git a/block/null.c b/block/null.c index 3944550f67..5d610fdfba 100644 --- a/block/null.c +++ b/block/null.c @@ -93,6 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, } s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); qemu_opts_del(opts); + 
bs->supported_write_flags = BDRV_REQ_FUA; return ret; } @@ -116,22 +117,22 @@ static coroutine_fn int null_co_common(BlockDriverState *bs) return 0; } -static coroutine_fn int null_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) +static coroutine_fn int null_co_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVNullState *s = bs->opaque; if (s->read_zeroes) { - qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_memset(qiov, 0, 0, bytes); } return null_co_common(bs); } -static coroutine_fn int null_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) +static coroutine_fn int null_co_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { return null_co_common(bs); } @@ -186,26 +187,26 @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, return &acb->common; } -static BlockAIOCB *null_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static BlockAIOCB *null_aio_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, + void *opaque) { BDRVNullState *s = bs->opaque; if (s->read_zeroes) { - qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_memset(qiov, 0, 0, bytes); } return null_aio_common(bs, cb, opaque); } -static BlockAIOCB *null_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static BlockAIOCB *null_aio_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, + void *opaque) { return null_aio_common(bs, cb, opaque); } @@ -265,8 +266,8 @@ static BlockDriver bdrv_null_co = { .bdrv_close = null_close, .bdrv_getlength = 
null_getlength, - .bdrv_co_readv = null_co_readv, - .bdrv_co_writev = null_co_writev, + .bdrv_co_preadv = null_co_preadv, + .bdrv_co_pwritev = null_co_pwritev, .bdrv_co_flush_to_disk = null_co_flush, .bdrv_reopen_prepare = null_reopen_prepare, @@ -285,8 +286,8 @@ static BlockDriver bdrv_null_aio = { .bdrv_close = null_close, .bdrv_getlength = null_getlength, - .bdrv_aio_readv = null_aio_readv, - .bdrv_aio_writev = null_aio_writev, + .bdrv_aio_preadv = null_aio_preadv, + .bdrv_aio_pwritev = null_aio_pwritev, .bdrv_aio_flush = null_aio_flush, .bdrv_reopen_prepare = null_reopen_prepare, diff --git a/block/parallels.c b/block/parallels.c index 045810d00f..6e9c37f44e 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -311,13 +311,15 @@ static int coroutine_fn parallels_co_block_status(BlockDriverState *bs, } static coroutine_fn int parallels_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov, int flags) { BDRVParallelsState *s = bs->opaque; uint64_t bytes_done = 0; QEMUIOVector hd_qiov; int ret = 0; + assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); while (nb_sectors > 0) { diff --git a/block/qcow.c b/block/qcow.c index 4b2f7db74c..3ba2ca25ea 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -720,7 +720,8 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, } static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) + int nb_sectors, QEMUIOVector *qiov, + int flags) { BDRVQcowState *s = bs->opaque; int index_in_cluster; @@ -731,6 +732,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, uint8_t *buf; void *orig_buf; + assert(!flags); s->cluster_cache_offset = -1; /* disable compressed cache */ /* We must always copy the iov when encrypting, so we @@ -1110,7 +1112,7 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, if (ret 
!= Z_STREAM_END || out_len >= s->cluster_size) { /* could not compress: write normal cluster */ ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS, - bytes >> BDRV_SECTOR_BITS, qiov); + bytes >> BDRV_SECTOR_BITS, qiov, 0); if (ret < 0) { goto fail; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 6b8b63514a..2dc23005b7 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1577,9 +1577,9 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, case QCOW2_CLUSTER_COMPRESSED: /* Compressed clusters don't have QCOW_OFLAG_COPIED */ if (l2_entry & QCOW_OFLAG_COPIED) { - fprintf(stderr, "ERROR: cluster %" PRId64 ": " + fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": " "copied flag must never be set for compressed " - "clusters\n", l2_entry >> s->cluster_bits); + "clusters\n", l2_entry & s->cluster_offset_mask); l2_entry &= ~QCOW_OFLAG_COPIED; res->corruptions++; } diff --git a/block/qcow2.c b/block/qcow2.c index 2f36e632f9..6d532470a8 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -802,23 +802,30 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, } else if (refcount_cache_size_set) { *l2_cache_size = combined_cache_size - *refcount_cache_size; } else { - *refcount_cache_size = combined_cache_size - / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1); - *l2_cache_size = combined_cache_size - *refcount_cache_size; + uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8); + uint64_t min_refcount_cache = + (uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size; + + /* Assign as much memory as possible to the L2 cache, and + * use the remainder for the refcount cache */ + if (combined_cache_size >= max_l2_cache + min_refcount_cache) { + *l2_cache_size = max_l2_cache; + *refcount_cache_size = combined_cache_size - *l2_cache_size; + } else { + *refcount_cache_size = + MIN(combined_cache_size, min_refcount_cache); + *l2_cache_size = 
combined_cache_size - *refcount_cache_size; + } } } else { - if (!l2_cache_size_set && !refcount_cache_size_set) { + if (!l2_cache_size_set) { *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, (uint64_t)DEFAULT_L2_CACHE_CLUSTERS * s->cluster_size); - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!l2_cache_size_set) { - *l2_cache_size = *refcount_cache_size - * DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!refcount_cache_size_set) { - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; + } + if (!refcount_cache_size_set) { + *refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size; } } diff --git a/block/qcow2.h b/block/qcow2.h index adf5c3950f..01b5250415 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -77,10 +77,6 @@ #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ -/* The refblock cache needs only a fourth of the L2 cache size to cover as many - * clusters */ -#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4 - #define DEFAULT_CLUSTER_SIZE 65536 diff --git a/block/qed.c b/block/qed.c index 1db8eaf241..65cfe92393 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1437,8 +1437,9 @@ static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs, static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) + QEMUIOVector *qiov, int flags) { + assert(!flags); return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE); } diff --git a/block/quorum.c b/block/quorum.c index a5051da56e..e448d7e384 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -115,6 +115,7 @@ struct QuorumAIOCB { /* Request metadata */ uint64_t offset; uint64_t bytes; + int flags; QEMUIOVector *qiov; /* calling IOV */ @@ -157,7 +158,8 @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, QEMUIOVector *qiov, uint64_t offset, - 
uint64_t bytes) + uint64_t bytes, + int flags) { BDRVQuorumState *s = bs->opaque; QuorumAIOCB *acb = g_new(QuorumAIOCB, 1); @@ -168,6 +170,7 @@ static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, .bs = bs, .offset = offset, .bytes = bytes, + .flags = flags, .qiov = qiov, .votes.compare = quorum_sha256_compare, .votes.vote_list = QLIST_HEAD_INITIALIZER(acb.votes.vote_list), @@ -271,9 +274,11 @@ static void quorum_rewrite_entry(void *opaque) BDRVQuorumState *s = acb->bs->opaque; /* Ignore any errors, it's just a correction attempt for already - * corrupted data. */ + * corrupted data. + * Mask out BDRV_REQ_WRITE_UNCHANGED because this overwrites the + * area with different data from the other children. */ bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes, - acb->qiov, 0); + acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED); /* Wake up the caller after the last rewrite */ acb->rewrite_count--; @@ -673,7 +678,7 @@ static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); int ret; acb->is_read = true; @@ -699,7 +704,7 @@ static void write_quorum_entry(void *opaque) sacb->bs = s->children[i]->bs; sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, - acb->qiov, 0); + acb->qiov, acb->flags); if (sacb->ret == 0) { acb->success_count++; } else { @@ -719,7 +724,7 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); int i, ret; for (i = 0; i < s->num_children; i++) { @@ -961,6 +966,8 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, } s->next_child_index = s->num_children; 
+ bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + g_free(opened); goto exit; diff --git a/block/raw-format.c b/block/raw-format.c index a378547c99..fe33693a2d 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -415,10 +415,11 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, } bs->sg = bs->file->bs->sg; - bs->supported_write_flags = BDRV_REQ_FUA & - bs->file->bs->supported_write_flags; - bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & - bs->file->bs->supported_zero_flags; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, diff --git a/block/rbd.c b/block/rbd.c index a14b42fcde..a16431e267 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -231,6 +231,13 @@ done: } +static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp) +{ + /* XXX Does RBD support AIO on less than 512-byte alignment? 
*/ + bs->bl.request_alignment = 512; +} + + static int qemu_rbd_set_auth(rados_t cluster, const char *secretid, Error **errp) { @@ -899,27 +906,23 @@ failed: return NULL; } -static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, - (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque, - RBD_AIO_READ); -} - -static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, +static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, - (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque, + return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque, + RBD_AIO_READ); +} + +static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque, RBD_AIO_WRITE); } @@ -1158,6 +1161,7 @@ static BlockDriver bdrv_rbd = { .format_name = "rbd", .instance_size = sizeof(BDRVRBDState), .bdrv_parse_filename = qemu_rbd_parse_filename, + .bdrv_refresh_limits = qemu_rbd_refresh_limits, .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_reopen_prepare = qemu_rbd_reopen_prepare, @@ -1170,8 +1174,8 @@ static BlockDriver bdrv_rbd = { .bdrv_truncate = qemu_rbd_truncate, .protocol_name = "rbd", - .bdrv_aio_readv = qemu_rbd_aio_readv, - .bdrv_aio_writev = qemu_rbd_aio_writev, + .bdrv_aio_preadv = qemu_rbd_aio_preadv, + .bdrv_aio_pwritev = qemu_rbd_aio_pwritev, #ifdef LIBRBD_SUPPORTS_AIO_FLUSH .bdrv_aio_flush = qemu_rbd_aio_flush, diff --git a/block/replication.c b/block/replication.c index 6c0c7186d9..48148b884a 
100644 --- a/block/replication.c +++ b/block/replication.c @@ -260,7 +260,8 @@ out: static coroutine_fn int replication_co_writev(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, - QEMUIOVector *qiov) + QEMUIOVector *qiov, + int flags) { BDRVReplicationState *s = bs->opaque; QEMUIOVector hd_qiov; @@ -271,6 +272,7 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, int ret; int64_t n; + assert(!flags); ret = replication_get_io_status(s); if (ret < 0) { goto out; diff --git a/block/sheepdog.c b/block/sheepdog.c index fed2a04797..4237132419 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2614,13 +2614,15 @@ static void sd_aio_complete(SheepdogAIOCB *acb) } static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) + int nb_sectors, QEMUIOVector *qiov, + int flags) { SheepdogAIOCB acb; int ret; int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; BDRVSheepdogState *s = bs->opaque; + assert(!flags); if (offset > s->inode.vdi_size) { ret = sd_truncate(bs, offset, PREALLOC_MODE_OFF, NULL); if (ret < 0) { diff --git a/block/ssh.c b/block/ssh.c index 412a1bfc17..4c4fa3ccfc 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -1164,11 +1164,13 @@ static int ssh_write(BDRVSSHState *s, BlockDriverState *bs, static coroutine_fn int ssh_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) + int nb_sectors, QEMUIOVector *qiov, + int flags) { BDRVSSHState *s = bs->opaque; int ret; + assert(!flags); qemu_co_mutex_lock(&s->lock); ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, qiov); diff --git a/block/stream.c b/block/stream.c index 1a85708fcf..df9660d2fc 100644 --- a/block/stream.c +++ b/block/stream.c @@ -29,11 +29,8 @@ enum { STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ }; -#define SLICE_TIME 100000000ULL /* ns */ - typedef struct StreamBlockJob { BlockJob common; - RateLimit limit; BlockDriverState 
*base; BlockdevOnError on_error; char *backing_file_str; @@ -107,6 +104,7 @@ static void coroutine_fn stream_run(void *opaque) BlockBackend *blk = s->common.blk; BlockDriverState *bs = blk_bs(blk); BlockDriverState *base = s->base; + int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; int error = 0; @@ -118,11 +116,12 @@ static void coroutine_fn stream_run(void *opaque) goto out; } - s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { - ret = s->common.len; + len = bdrv_getlength(bs); + if (len < 0) { + ret = len; goto out; } + block_job_progress_set_remaining(&s->common, len); buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); @@ -135,7 +134,7 @@ static void coroutine_fn stream_run(void *opaque) bdrv_enable_copy_on_read(bs); } - for ( ; offset < s->common.len; offset += n) { + for ( ; offset < len; offset += n) { bool copy; /* Note that even when no rate limit is applied we need to yield @@ -159,7 +158,7 @@ static void coroutine_fn stream_run(void *opaque) /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { - n = s->common.len - offset; + n = len - offset; } copy = (ret == 1); @@ -185,9 +184,9 @@ static void coroutine_fn stream_run(void *opaque) ret = 0; /* Publish progress */ - s->common.offset += n; - if (copy && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); + block_job_progress_update(&s->common, n); + if (copy) { + delay_ns = block_job_ratelimit_get_delay(&s->common, n); } else { delay_ns = 0; } @@ -209,21 +208,9 @@ out: block_job_defer_to_main_loop(&s->common, stream_complete, data); } -static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - StreamBlockJob *s = container_of(job, StreamBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed, SLICE_TIME); -} - static const BlockJobDriver stream_job_driver = { .instance_size = sizeof(StreamBlockJob), .job_type = 
BLOCK_JOB_TYPE_STREAM, - .set_speed = stream_set_speed, .start = stream_run, }; diff --git a/block/throttle.c b/block/throttle.c index 95ed06acd8..e298827f95 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -81,8 +81,10 @@ static int throttle_open(BlockDriverState *bs, QDict *options, if (!bs->file) { return -EINVAL; } - bs->supported_write_flags = bs->file->bs->supported_write_flags; - bs->supported_zero_flags = bs->file->bs->supported_zero_flags; + bs->supported_write_flags = bs->file->bs->supported_write_flags | + BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = bs->file->bs->supported_zero_flags | + BDRV_REQ_WRITE_UNCHANGED; return throttle_configure_tgm(bs, tgm, options, errp); } diff --git a/block/vhdx.c b/block/vhdx.c index c3a4220a35..0b1e21c750 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1226,7 +1226,8 @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) } static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) + int nb_sectors, QEMUIOVector *qiov, + int flags) { int ret = -ENOTSUP; BDRVVHDXState *s = bs->opaque; @@ -1242,6 +1243,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, uint64_t bat_prior_offset = 0; bool bat_update = false; + assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); qemu_co_mutex_lock(&s->lock); diff --git a/block/vxhs.c b/block/vxhs.c index 55ae1a666e..339e23218d 100644 --- a/block/vxhs.c +++ b/block/vxhs.c @@ -216,6 +216,12 @@ static void vxhs_parse_filename(const char *filename, QDict *options, } } +static void vxhs_refresh_limits(BlockDriverState *bs, Error **errp) +{ + /* XXX Does VXHS support AIO on less than 512-byte alignment? */ + bs->bl.request_alignment = 512; +} + static int vxhs_init_and_ref(void) { if (vxhs_ref++ == 0) { @@ -424,21 +430,17 @@ static const AIOCBInfo vxhs_aiocb_info = { * and is passed to QNIO. When QNIO completes the work, * it will be passed back through the callback. 
*/ -static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, uint64_t offset, + QEMUIOVector *qiov, uint64_t size, BlockCompletionFunc *cb, void *opaque, VDISKAIOCmd iodir) { VXHSAIOCB *acb = NULL; BDRVVXHSState *s = bs->opaque; - size_t size; - uint64_t offset; int iio_flags = 0; int ret = 0; void *dev_handle = s->vdisk_hostinfo.dev_handle; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); /* @@ -451,11 +453,11 @@ static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, switch (iodir) { case VDISK_AIO_WRITE: ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov, - offset, (uint64_t)size, iio_flags); + offset, size, iio_flags); break; case VDISK_AIO_READ: ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov, - offset, (uint64_t)size, iio_flags); + offset, size, iio_flags); break; default: trace_vxhs_aio_rw_invalid(iodir); @@ -474,22 +476,20 @@ errout: return NULL; } -static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, +static BlockAIOCB *vxhs_aio_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, BlockCompletionFunc *cb, void *opaque) { - return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, - opaque, VDISK_AIO_READ); + return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_READ); } -static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static BlockAIOCB *vxhs_aio_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + BlockCompletionFunc *cb, void *opaque) { - return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, - cb, opaque, VDISK_AIO_WRITE); + return vxhs_aio_rw(bs, offset, qiov, 
bytes, cb, opaque, VDISK_AIO_WRITE); } static void vxhs_close(BlockDriverState *bs) @@ -561,10 +561,11 @@ static BlockDriver bdrv_vxhs = { .instance_size = sizeof(BDRVVXHSState), .bdrv_file_open = vxhs_open, .bdrv_parse_filename = vxhs_parse_filename, + .bdrv_refresh_limits = vxhs_refresh_limits, .bdrv_close = vxhs_close, .bdrv_getlength = vxhs_getlength, - .bdrv_aio_readv = vxhs_aio_readv, - .bdrv_aio_writev = vxhs_aio_writev, + .bdrv_aio_preadv = vxhs_aio_preadv, + .bdrv_aio_pwritev = vxhs_aio_pwritev, }; static void bdrv_vxhs_init(void) diff --git a/block/win32-aio.c b/block/win32-aio.c index 3be8f458fa..9cd355d42f 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -112,15 +112,14 @@ static const AIOCBInfo win32_aiocb_info = { BlockAIOCB *win32_aio_submit(BlockDriverState *bs, QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, BlockCompletionFunc *cb, void *opaque, int type) { struct QEMUWin32AIOCB *waiocb; - uint64_t offset = sector_num * 512; DWORD rc; waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque); - waiocb->nbytes = nb_sectors * 512; + waiocb->nbytes = bytes; waiocb->qiov = qiov; waiocb->is_read = (type == QEMU_AIO_READ); diff --git a/blockjob.c b/blockjob.c index dfffad921a..36c5fdeb2f 100644 --- a/blockjob.c +++ b/blockjob.c @@ -359,6 +359,11 @@ static bool block_job_started(BlockJob *job) return job->co; } +const BlockJobDriver *block_job_driver(BlockJob *job) +{ + return job->driver; +} + /** * All jobs must allow a pause point before entering their job proper. This * ensures that jobs can be paused prior to being started, then resumed later. 
@@ -659,22 +664,18 @@ static bool block_job_timer_pending(BlockJob *job) void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { - Error *local_err = NULL; int64_t old_speed = job->speed; - if (!job->driver->set_speed) { - error_setg(errp, QERR_UNSUPPORTED); - return; - } if (block_job_apply_verb(job, BLOCK_JOB_VERB_SET_SPEED, errp)) { return; } - job->driver->set_speed(job, speed, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (speed < 0) { + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } + ratelimit_set_speed(&job->limit, speed, BLOCK_JOB_SLICE_TIME); + job->speed = speed; if (speed && speed <= old_speed) { return; @@ -684,6 +685,15 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) block_job_enter_cond(job, block_job_timer_pending); } +int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n) +{ + if (!job->speed) { + return 0; + } + + return ratelimit_calculate_delay(&job->limit, n); +} + void block_job_complete(BlockJob *job, Error **errp) { /* Should not be reachable via external interface for internal jobs */ @@ -702,7 +712,7 @@ void block_job_complete(BlockJob *job, Error **errp) void block_job_finalize(BlockJob *job, Error **errp) { - assert(job && job->id && job->txn); + assert(job && job->id); if (block_job_apply_verb(job, BLOCK_JOB_VERB_FINALIZE, errp)) { return; } @@ -810,6 +820,16 @@ int block_job_complete_sync(BlockJob *job, Error **errp) return block_job_finish_sync(job, &block_job_complete, errp); } +void block_job_progress_update(BlockJob *job, uint64_t done) +{ + job->offset += done; +} + +void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining) +{ + job->len = job->offset + remaining; +} + BlockJobInfo *block_job_query(BlockJob *job, Error **errp) { BlockJobInfo *info; @@ -831,6 +851,8 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp) info->status = job->status; info->auto_finalize = job->auto_finalize; info->auto_dismiss = 
job->auto_dismiss; + info->has_error = job->ret != 0; + info->error = job->ret ? g_strdup(strerror(-job->ret)) : NULL; return info; } diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index feb711fb6a..8e1547ded2 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -400,10 +400,10 @@ L2 table entry: 62: 0 for standard clusters 1 for compressed clusters - 63: 0 for a cluster that is unused or requires COW, 1 if its - refcount is exactly one. This information is only accurate - in L2 tables that are reachable from the active L1 - table. + 63: 0 for clusters that are unused, compressed or require COW. + 1 for standard clusters whose refcount is exactly one. + This information is only accurate in L2 tables + that are reachable from the active L1 table. Standard Cluster Descriptor: diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt index 170191a242..8a09a5cc5f 100644 --- a/docs/qcow2-cache.txt +++ b/docs/qcow2-cache.txt @@ -116,31 +116,30 @@ There are three options available, and all of them take bytes: "refcount-cache-size": maximum size of the refcount block cache "cache-size": maximum size of both caches combined -There are two things that need to be taken into account: +There are a few things that need to be taken into account: - Both caches must have a size that is a multiple of the cluster size (or the cache entry size: see "Using smaller cache sizes" below). - - If you only set one of the options above, QEMU will automatically - adjust the others so that the L2 cache is 4 times bigger than the - refcount cache. + - The default L2 cache size is 8 clusters or 1MB (whichever is more), + and the minimum is 2 clusters (or 2 cache entries, see below). -This means that these options are equivalent: + - The default (and minimum) refcount cache size is 4 clusters. 
- -drive file=hd.qcow2,l2-cache-size=2097152 - -drive file=hd.qcow2,refcount-cache-size=524288 - -drive file=hd.qcow2,cache-size=2621440 + - If only "cache-size" is specified then QEMU will assign as much + memory as possible to the L2 cache before increasing the refcount + cache size. -The reason for this 1/4 ratio is to ensure that both caches cover the -same amount of disk space. Note however that this is only valid with -the default value of refcount_bits (16). If you are using a different -value you might want to calculate both cache sizes yourself since QEMU -will always use the same 1/4 ratio. +Unlike L2 tables, refcount blocks are not used during normal I/O but +only during allocations and internal snapshots. In most cases they are +accessed sequentially (even during random guest I/O) so increasing the +refcount cache size won't have any measurable effect on performance +(this can change if you are using internal snapshots, so you may want +to think about increasing the cache size if you use them heavily). -It's also worth mentioning that there's no strict need for both caches -to cover the same amount of disk space. The refcount cache is used -much less often than the L2 cache, so it's perfectly reasonable to -keep it small. +Before QEMU 2.12 the refcount cache had a default size of 1/4 of the +L2 cache size. This resulted in unnecessarily large caches, so now the +refcount cache is as small as possible unless overridden by the user. 
Using smaller cache entries diff --git a/hmp-commands.hx b/hmp-commands.hx index 35d862a5d2..227f7eee88 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1647,7 +1647,8 @@ ETEXI STEXI @item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} @findex block_set_io_throttle -Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} +Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}. +@var{device} can be a block device name, a qdev ID or a QOM path. ETEXI { diff --git a/hmp.c b/hmp.c index 898e25f3e1..bdb340605c 100644 --- a/hmp.c +++ b/hmp.c @@ -1789,9 +1789,8 @@ void hmp_change(Monitor *mon, const QDict *qdict) void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict) { Error *err = NULL; + char *device = (char *) qdict_get_str(qdict, "device"); BlockIOThrottle throttle = { - .has_device = true, - .device = (char *) qdict_get_str(qdict, "device"), .bps = qdict_get_int(qdict, "bps"), .bps_rd = qdict_get_int(qdict, "bps_rd"), .bps_wr = qdict_get_int(qdict, "bps_wr"), @@ -1800,6 +1799,17 @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict) .iops_wr = qdict_get_int(qdict, "iops_wr"), }; + /* qmp_block_set_io_throttle has separate parameters for the + * (deprecated) block device name and the qdev ID but the HMP + * version has only one, so we must decide which one to pass. 
*/ + if (blk_by_name(device)) { + throttle.has_device = true; + throttle.device = device; + } else { + throttle.has_id = true; + throttle.id = device; + } + qmp_block_set_io_throttle(&throttle, &err); hmp_handle_error(mon, &err); } diff --git a/include/block/block.h b/include/block/block.h index cdec3639a3..3894edda9d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -54,8 +54,12 @@ typedef enum { BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, + /* Signifies that this write request will not change the visible disk + * content. */ + BDRV_REQ_WRITE_UNCHANGED = 0x40, + /* Mask of valid flags */ - BDRV_REQ_MASK = 0x3f, + BDRV_REQ_MASK = 0x7f, } BdrvRequestFlags; typedef struct BlockSizes { @@ -205,6 +209,9 @@ enum { * This permission (which is weaker than BLK_PERM_WRITE) is both enough and * required for writes to the block node when the caller promises that * the visible disk content doesn't change. + * + * As the BLK_PERM_WRITE permission is strictly stronger, either is + * sufficient to perform an unchanging write. 
*/ BLK_PERM_WRITE_UNCHANGED = 0x04, diff --git a/include/block/block_int.h b/include/block/block_int.h index c4dd1d4bb8..76b589da57 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -141,11 +141,11 @@ struct BlockDriver { void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options); /* aio */ - BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags, BlockCompletionFunc *cb, void *opaque); - BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque); @@ -174,8 +174,6 @@ struct BlockDriver { int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); - int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags); /** * @offset: position in bytes to write at @@ -658,10 +656,24 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; - /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */ + /* Flags honored during pwrite (so far: BDRV_REQ_FUA, + * BDRV_REQ_WRITE_UNCHANGED). + * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those + * writes will be issued as normal writes without the flag set. 
+ * This is important to note for drivers that do not explicitly + * request a WRITE permission for their children and instead take + * the same permissions as their parent did (this is commonly what + * block filters do). Such drivers have to be aware that the + * parent may have taken a WRITE_UNCHANGED permission only and is + * issuing such requests. Drivers either must make sure that + * these requests do not result in plain WRITE accesses (usually + * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding + * every incoming write request as-is, including potentially that + * flag), or they have to explicitly take the WRITE permission for + * their children. */ unsigned int supported_write_flags; /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, - * BDRV_REQ_MAY_UNMAP) */ + * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ unsigned int supported_zero_flags; /* the following member gives a name to every node on the bs graph. */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index fc645dac68..0f56f723de 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -27,6 +27,9 @@ #define BLOCKJOB_H #include "block/block.h" +#include "qemu/ratelimit.h" + +#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */ typedef struct BlockJobDriver BlockJobDriver; typedef struct BlockJobTxn BlockJobTxn; @@ -118,6 +121,9 @@ typedef struct BlockJob { /** Speed that was set with @block_job_set_speed. */ int64_t speed; + /** Rate limiting data structure for implementing @speed. */ + RateLimit limit; + /** The completion function that will be called when the job completes. */ BlockCompletionFunc *cb; @@ -277,6 +283,25 @@ void block_job_finalize(BlockJob *job, Error **errp); */ void block_job_dismiss(BlockJob **job, Error **errp); +/** + * block_job_progress_update: + * @job: The job that has made progress + * @done: How much progress the job made + * + * Updates the progress counter of the job. 
+ */ +void block_job_progress_update(BlockJob *job, uint64_t done); + +/** + * block_job_progress_set_remaining: + * @job: The job whose expected progress end value is set + * @remaining: Expected end value of the progress counter of the job + * + * Sets the expected end value of the progress counter of a job so that a + * completion percentage can be calculated when the progress is updated. + */ +void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining); + /** * block_job_query: * @job: The job to get information about. @@ -427,4 +452,11 @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job); */ bool block_job_is_internal(BlockJob *job); +/** + * block_job_driver: + * + * Returns the driver associated with a block job. + */ +const BlockJobDriver *block_job_driver(BlockJob *job); + #endif diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index d5a515de9b..62ec964d09 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -41,9 +41,6 @@ struct BlockJobDriver { /** String describing the operation, part of query-block-jobs QMP API */ BlockJobType job_type; - /** Optional callback for job types that support setting a speed limit */ - void (*set_speed)(BlockJob *job, int64_t speed, Error **errp); - /** Mandatory: Entrypoint for the Coroutine. */ CoroutineEntry *start; @@ -168,6 +165,14 @@ void block_job_sleep_ns(BlockJob *job, int64_t ns); */ void block_job_yield(BlockJob *job); +/** + * block_job_ratelimit_get_delay: + * + * Calculate and return delay for the next request in ns. See the documentation + * of ratelimit_calculate_delay() for details. + */ +int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n); + /** * block_job_early_fail: * @bs: The block device. 
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index a4cdbbf1b7..9e47b8a629 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -57,7 +57,7 @@ void win32_aio_cleanup(QEMUWin32AIOState *aio); int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); BlockAIOCB *win32_aio_submit(BlockDriverState *bs, QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, BlockCompletionFunc *cb, void *opaque, int type); void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, AioContext *old_context); diff --git a/qapi/block-core.json b/qapi/block-core.json index 21c3470234..55728cb823 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1172,6 +1172,9 @@ # @auto-dismiss: Job will dismiss itself when CONCLUDED, moving to the NULL # state and disappearing from the query list. (since 2.12) # +# @error: Error information if the job did not complete successfully. +# Not set if the job completed successfully. 
(since 2.12.1) +# # Since: 1.1 ## { 'struct': 'BlockJobInfo', @@ -1179,7 +1182,8 @@ 'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int', 'io-status': 'BlockDeviceIoStatus', 'ready': 'bool', 'status': 'BlockJobStatus', - 'auto-finalize': 'bool', 'auto-dismiss': 'bool' } } + 'auto-finalize': 'bool', 'auto-dismiss': 'bool', + '*error': 'str' } } ## # @query-block-jobs: @@ -2506,11 +2510,12 @@ # @vxhs: Since 2.10 # @throttle: Since 2.11 # @nvme: Since 2.12 +# @copy-on-read: Since 2.13 # # Since: 2.9 ## { 'enum': 'BlockdevDriver', - 'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', + 'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed', @@ -3527,6 +3532,7 @@ 'blkverify': 'BlockdevOptionsBlkverify', 'bochs': 'BlockdevOptionsGenericFormat', 'cloop': 'BlockdevOptionsGenericFormat', + 'copy-on-read':'BlockdevOptionsGenericFormat', 'dmg': 'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', 'ftp': 'BlockdevOptionsCurlFtp', @@ -4054,6 +4060,7 @@ 'blkverify': 'BlockdevCreateNotSupported', 'bochs': 'BlockdevCreateNotSupported', 'cloop': 'BlockdevCreateNotSupported', + 'copy-on-read': 'BlockdevCreateNotSupported', 'dmg': 'BlockdevCreateNotSupported', 'file': 'BlockdevCreateOptionsFile', 'ftp': 'BlockdevCreateNotSupported', diff --git a/qemu-img.c b/qemu-img.c index ea62d2d61e..60e45ec103 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -277,12 +277,12 @@ static BlockBackend *img_open_opts(const char *optstr, options = qemu_opts_to_qdict(opts, NULL); if (force_share) { if (qdict_haskey(options, BDRV_OPT_FORCE_SHARE) - && !qdict_get_bool(options, BDRV_OPT_FORCE_SHARE)) { + && strcmp(qdict_get_str(options, BDRV_OPT_FORCE_SHARE), "on")) { error_report("--force-share/-U conflicts with image options"); qobject_unref(options); return NULL; } - qdict_put_bool(options, 
BDRV_OPT_FORCE_SHARE, true); + qdict_put_str(options, BDRV_OPT_FORCE_SHARE, "on"); } blk = blk_new_open(NULL, NULL, options, flags, &local_err); if (!blk) { @@ -3381,7 +3381,7 @@ static int img_resize(int argc, char **argv) Error *err = NULL; int c, ret, relative; const char *filename, *fmt, *size; - int64_t n, total_size, current_size; + int64_t n, total_size, current_size, new_size; bool quiet = false; BlockBackend *blk = NULL; PreallocMode prealloc = PREALLOC_MODE_OFF; @@ -3557,11 +3557,42 @@ static int img_resize(int argc, char **argv) } ret = blk_truncate(blk, total_size, prealloc, &err); - if (!ret) { - qprintf(quiet, "Image resized.\n"); - } else { + if (ret < 0) { error_report_err(err); + goto out; } + + new_size = blk_getlength(blk); + if (new_size < 0) { + error_report("Failed to verify truncated image length: %s", + strerror(-new_size)); + ret = -1; + goto out; + } + + /* Some block drivers implement a truncation method, but only so + * the user can cause qemu to refresh the image's size from disk. + * The idea is that the user resizes the image outside of qemu and + * then invokes block_resize to inform qemu about it. + * (This includes iscsi and file-posix for device files.) + * Of course, that is not the behavior someone invoking + * qemu-img resize would find useful, so we catch that behavior + * here and tell the user. 
*/ + if (new_size != total_size && new_size == current_size) { + error_report("Image was not resized; resizing may not be supported " + "for this image"); + ret = -1; + goto out; + } + + if (new_size != total_size) { + warn_report("Image should have been resized to %" PRIi64 + " bytes, but was resized to %" PRIi64 " bytes", + total_size, new_size); + } + + qprintf(quiet, "Image resized.\n"); + out: blk_unref(blk); if (ret) { diff --git a/qemu-io.c b/qemu-io.c index 72fee0d8b7..73c638ff8b 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -95,12 +95,12 @@ static int openfile(char *name, int flags, bool writethrough, bool force_share, opts = qdict_new(); } if (qdict_haskey(opts, BDRV_OPT_FORCE_SHARE) - && !qdict_get_bool(opts, BDRV_OPT_FORCE_SHARE)) { + && strcmp(qdict_get_str(opts, BDRV_OPT_FORCE_SHARE), "on")) { error_report("-U conflicts with image options"); qobject_unref(opts); return 1; } - qdict_put_bool(opts, BDRV_OPT_FORCE_SHARE, true); + qdict_put_str(opts, BDRV_OPT_FORCE_SHARE, "on"); } qemuio_blk = blk_new_open(name, NULL, opts, flags, &local_err); if (!qemuio_blk) { diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 index 6cf4fcb866..45b359c2ba 100755 --- a/tests/qemu-iotests/122 +++ b/tests/qemu-iotests/122 @@ -129,53 +129,6 @@ $QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _fil $QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -echo -echo "=== Corrupted size field in compressed cluster descriptor ===" -echo -# Create an empty image and fill half of it with compressed data. -# The L2 entries of the two compressed clusters are located at -# 0x800000 and 0x800008, their original values are 0x4008000000a00000 -# and 0x4008000000a00802 (5 sectors for compressed data each). 
-_make_test_img 8M -o cluster_size=2M -$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \ - 2>&1 | _filter_qemu_io | _filter_testdir - -# Reduce size of compressed data to 4 sectors: this corrupts the image. -poke_file "$TEST_IMG" $((0x800000)) "\x40\x06" -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - -# 'qemu-img check' however doesn't see anything wrong because it -# doesn't try to decompress the data and the refcounts are consistent. -# TODO: update qemu-img so this can be detected. -_check_test_img - -# Increase size of compressed data to the maximum (8192 sectors). -# This makes QEMU read more data (8192 sectors instead of 5, host -# addresses [0xa00000, 0xdfffff]), but the decompression algorithm -# stops once we have enough to restore the uncompressed cluster, so -# the rest of the data is ignored. -poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe" -# Do it also for the second compressed cluster (L2 entry at 0x800008). -# In this case the compressed data would span 3 host clusters -# (host addresses: [0xa00802, 0xe00801]) -poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe" - -# Here the image is too small so we're asking QEMU to read beyond the -# end of the image. -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -# But if we grow the image we won't be reading beyond its end anymore. -$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - -# The refcount data is however wrong because due to the increased size -# of the compressed data it now reaches the following host clusters. -# This can be repaired by qemu-img check by increasing the refcount of -# those clusters. -# TODO: update qemu-img to correct the compressed cluster size instead. 
-_check_test_img -r all -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - echo echo "=== Full allocation with -S 0 ===" echo diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out index a6b7fe007e..47d8656db8 100644 --- a/tests/qemu-iotests/122.out +++ b/tests/qemu-iotests/122.out @@ -99,39 +99,6 @@ read 1024/1024 bytes at offset 1047552 read 1046528/1046528 bytes at offset 1048576 1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -=== Corrupted size field in compressed cluster descriptor === - -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 -wrote 2097152/2097152 bytes at offset 0 -2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2097152/2097152 bytes at offset 2097152 -2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read failed: Input/output error -No errors were found on the image. -read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 4194304/4194304 bytes at offset 4194304 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -ERROR cluster 6 refcount=1 reference=3 -ERROR cluster 7 refcount=1 reference=2 -Repairing cluster 6 refcount=1 reference=3 -Repairing cluster 7 refcount=1 reference=2 -Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3 -Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2 -The following inconsistencies were found and repaired: - - 0 leaked clusters - 4 corruptions - -Double checking the fixed image now... -No errors were found on the image. 
-read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4194304/4194304 bytes at offset 4194304 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - === Full allocation with -S 0 === Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out index e28e1eadba..96724a6c33 100644 --- a/tests/qemu-iotests/137.out +++ b/tests/qemu-iotests/137.out @@ -22,7 +22,7 @@ refcount-cache-size may not exceed cache-size L2 cache size too big L2 cache entry size must be a power of two between 512 and the cluster size (65536) L2 cache entry size must be a power of two between 512 and the cluster size (65536) -L2 cache size too big +Refcount cache size too big Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all') Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all Unsupported value 'blubb' for qcow2 option 'overlap-check'. 
Allowed are any of the following: none, constant, cached, all diff --git a/tests/qemu-iotests/153 b/tests/qemu-iotests/153 index a0fd815483..ec508c758f 100755 --- a/tests/qemu-iotests/153 +++ b/tests/qemu-iotests/153 @@ -242,6 +242,23 @@ _run_cmd $QEMU_IO "${TEST_IMG}" -c 'write 0 512' _cleanup_qemu +echo +echo "== Detecting -U and force-share conflicts ==" + +echo +echo 'No conflict:' +$QEMU_IMG info -U --image-opts driver=null-co,force-share=on +echo +echo 'Conflict:' +$QEMU_IMG info -U --image-opts driver=null-co,force-share=off + +echo +echo 'No conflict:' +$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=on' +echo +echo 'Conflict:' +$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=off' + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out index bb721cb747..2510762ba1 100644 --- a/tests/qemu-iotests/153.out +++ b/tests/qemu-iotests/153.out @@ -399,4 +399,20 @@ Is another process using the image? Closing the other _qemu_io_wrapper TEST_DIR/t.qcow2 -c write 0 512 + +== Detecting -U and force-share conflicts == + +No conflict: +image: null-co:// +file format: null-co +virtual size: 1.0G (1073741824 bytes) +disk size: unavailable + +Conflict: +qemu-img: --force-share/-U conflicts with image options + +No conflict: + +Conflict: +-U conflicts with image options *** done diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181 index 5e767c6195..e02979378d 100755 --- a/tests/qemu-iotests/181 +++ b/tests/qemu-iotests/181 @@ -96,6 +96,19 @@ echo # Enable postcopy-ram capability both on source and destination silent=yes _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)" + +qemu_error_no_exit=yes success_or_failure=yes \ + _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported" +if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then + _send_qemu_cmd $dest '' "(qemu)" + + _send_qemu_cmd $src 'quit' "" + _send_qemu_cmd $dest 'quit' "" + wait=1 _cleanup_qemu + + _notrun 'Postcopy is 
not supported' +fi + _send_qemu_cmd $src 'migrate_set_speed 4k' "(qemu)" _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)" _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)" diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 index 5e869fe2b7..3ae4975eec 100755 --- a/tests/qemu-iotests/197 +++ b/tests/qemu-iotests/197 @@ -44,6 +44,7 @@ esac _cleanup() { _cleanup_test_img + rm -f "$TEST_WRAP" rm -f "$BLKDBG_CONF" } trap "_cleanup; exit \$status" 0 1 2 3 15 diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201 index 11f640f5df..c1a1e00077 100755 --- a/tests/qemu-iotests/201 +++ b/tests/qemu-iotests/201 @@ -82,6 +82,19 @@ echo silent=yes _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)" + +qemu_error_no_exit=yes success_or_failure=yes \ + _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported" +if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then + _send_qemu_cmd $dest '' "(qemu)" + + _send_qemu_cmd $src 'quit' "" + _send_qemu_cmd $dest 'quit' "" + wait=1 _cleanup_qemu + + _notrun 'Postcopy is not supported' +fi + _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)" _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)" diff --git a/tests/qemu-iotests/214 b/tests/qemu-iotests/214 new file mode 100755 index 0000000000..c46ca2a6dd --- /dev/null +++ b/tests/qemu-iotests/214 @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Test qcow2 image compression +# +# Copyright (C) 2018 Igalia, S.L. +# Author: Alberto Garcia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq=$(basename "$0") +echo "QA output created by $seq" + +here=$PWD +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +# Repairing the corrupted image requires qemu-img check to store a +# refcount up to 3, which requires at least two refcount bits. +_unsupported_imgopts 'refcount_bits=1[^0-9]' + + +echo +echo "=== Corrupted size field in compressed cluster descriptor ===" +echo +# Create an empty image and fill half of it with compressed data. +# The L2 entries of the two compressed clusters are located at +# 0x800000 and 0x800008, their original values are 0x4008000000a00000 +# and 0x4008000000a00802 (5 sectors for compressed data each). +_make_test_img 8M -o cluster_size=2M +$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \ + 2>&1 | _filter_qemu_io | _filter_testdir + +# Reduce size of compressed data to 4 sectors: this corrupts the image. +poke_file "$TEST_IMG" $((0x800000)) "\x40\x06" +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# 'qemu-img check' however doesn't see anything wrong because it +# doesn't try to decompress the data and the refcounts are consistent. +# TODO: update qemu-img so this can be detected. +_check_test_img + +# Increase size of compressed data to the maximum (8192 sectors). +# This makes QEMU read more data (8192 sectors instead of 5, host +# addresses [0xa00000, 0xdfffff]), but the decompression algorithm +# stops once we have enough to restore the uncompressed cluster, so +# the rest of the data is ignored. 
+poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe" +# Do it also for the second compressed cluster (L2 entry at 0x800008). +# In this case the compressed data would span 3 host clusters +# (host addresses: [0xa00802, 0xe00801]) +poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe" + +# Here the image is too small so we're asking QEMU to read beyond the +# end of the image. +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +# But if we grow the image we won't be reading beyond its end anymore. +$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# The refcount data is however wrong because due to the increased size +# of the compressed data it now reaches the following host clusters. +# This can be repaired by qemu-img check by increasing the refcount of +# those clusters. +# TODO: update qemu-img to correct the compressed cluster size instead. +_check_test_img -r all +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/214.out b/tests/qemu-iotests/214.out new file mode 100644 index 0000000000..0fcd8dc051 --- /dev/null +++ b/tests/qemu-iotests/214.out @@ -0,0 +1,35 @@ +QA output created by 214 + +=== Corrupted size field in compressed cluster descriptor === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 +wrote 2097152/2097152 bytes at offset 0 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 2097152 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read failed: Input/output error +No errors were found on the image. 
+read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 4194304/4194304 bytes at offset 4194304 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +ERROR cluster 6 refcount=1 reference=3 +ERROR cluster 7 refcount=1 reference=2 +Repairing cluster 6 refcount=1 reference=3 +Repairing cluster 7 refcount=1 reference=2 +Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3 +Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2 +The following inconsistencies were found and repaired: + + 0 leaked clusters + 4 corruptions + +Double checking the fixed image now... +No errors were found on the image. +read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4194304/4194304 bytes at offset 4194304 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +*** done diff --git a/tests/qemu-iotests/215 b/tests/qemu-iotests/215 new file mode 100755 index 0000000000..2e616ed659 --- /dev/null +++ b/tests/qemu-iotests/215 @@ -0,0 +1,120 @@ +#!/bin/bash +# +# Test case for copy-on-read into qcow2, using the COR filter driver +# +# Copyright (C) 2018 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +TEST_WRAP="$TEST_DIR/t.wrap.qcow2" +BLKDBG_CONF="$TEST_DIR/blkdebug.conf" + +# Sanity check: our use of blkdebug fails if $TEST_DIR contains spaces +# or other problems +case "$TEST_DIR" in + *[^-_a-zA-Z0-9/]*) + _notrun "Suspicious TEST_DIR='$TEST_DIR', cowardly refusing to run" ;; +esac + +_cleanup() +{ + _cleanup_test_img + rm -f "$TEST_WRAP" + rm -f "$BLKDBG_CONF" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# Test is supported for any backing file; but we force qcow2 for our wrapper. +_supported_fmt generic +_supported_proto generic +_supported_os Linux +# LUKS support may be possible, but it complicates things. +_unsupported_fmt luks + +echo +echo '=== Copy-on-read ===' +echo + +# Prep the images +# VPC rounds image sizes to a specific geometry, force a specific size. +if [ "$IMGFMT" = "vpc" ]; then + IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size") +fi +_make_test_img 4G +$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io +IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \ + _make_test_img -F "$IMGFMT" -b "$TEST_IMG" | _filter_img_create +$QEMU_IO -f qcow2 -c "write -z -u 1M 64k" "$TEST_WRAP" | _filter_qemu_io + +# Ensure that a read of two clusters, but where one is already allocated, +# does not re-write the allocated cluster +cat > "$BLKDBG_CONF" <&1 | _filter_qemu_io) +case $output in + *allocate*) + _notrun "Insufficent memory to run test" ;; + *) printf '%s\n' "$output" ;; +esac +$QEMU_IO \ + -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \ + -c "read -P 0 $((3*1024*1024*1024 + 1024)) 1k" \ + | _filter_qemu_io + +# Copy-on-read is incompatible with read-only +$QEMU_IO \ + -c "open -r -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \ + 2>&1 | _filter_testdir + +# Break the backing chain, and show that images are 
identical, and that +# we properly copied over explicit zeros. +$QEMU_IMG rebase -u -b "" -f qcow2 "$TEST_WRAP" +$QEMU_IO -f qcow2 -c map "$TEST_WRAP" +_check_test_img +$QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP" + +# success, all done +echo '*** done' +status=0 diff --git a/tests/qemu-iotests/215.out b/tests/qemu-iotests/215.out new file mode 100644 index 0000000000..70b0f5fb19 --- /dev/null +++ b/tests/qemu-iotests/215.out @@ -0,0 +1,26 @@ +QA output created by 215 + +=== Copy-on-read === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296 +wrote 1024/1024 bytes at offset 3221225472 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=4294967296 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT +wrote 65536/65536 bytes at offset 1048576 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 131072/131072 bytes at offset 1048576 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 0/0 bytes at offset 0 +0 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 2147483136/2147483136 bytes at offset 1024 +2 GiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 3221226496 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +can't open device TEST_DIR/t.wrap.qcow2: Block node is read-only +2 GiB (0x80010000) bytes allocated at offset 0 bytes (0x0) +1023.938 MiB (0x3fff0000) bytes not allocated at offset 2 GiB (0x80010000) +64 KiB (0x10000) bytes allocated at offset 3 GiB (0xc0000000) +1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000) +No errors were found on the image. +Images are identical. +*** done diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216 new file mode 100755 index 0000000000..ca9b47a7fd --- /dev/null +++ b/tests/qemu-iotests/216 @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# +# Copy-on-read tests using a COR filter node +# +# Copyright (C) 2018 Red Hat, Inc. 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Creator/Owner: Max Reitz + +import iotests +from iotests import log, qemu_img_pipe, qemu_io, filter_qemu_io + +# Need backing file support +iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk']) +iotests.verify_platform(['linux']) + +log('') +log('=== Copy-on-read across nodes ===') +log('') + +# The old copy-on-read mechanism without a filter node cannot request +# WRITE_UNCHANGED permissions for its child. Therefore it just tries +# to sneak its write by the usual permission system and holds its +# fingers crossed. However, that sneaking does not work so well when +# there is a filter node in the way: That will receive the write +# request and re-issue a new one to its child, which this time is a +# proper write request that will make the permission system cough -- +# unless there is someone at the top (like a guest device) that has +# requested write permissions. +# +# A COR filter node, however, can request the proper permissions for +# its child and therefore is not hit by this issue. 
+ +with iotests.FilePath('base.img') as base_img_path, \ + iotests.FilePath('top.img') as top_img_path, \ + iotests.VM() as vm: + + log('--- Setting up images ---') + log('') + + qemu_img_pipe('create', '-f', iotests.imgfmt, base_img_path, '64M') + + log(filter_qemu_io(qemu_io(base_img_path, '-c', 'write -P 1 0M 1M'))) + + qemu_img_pipe('create', '-f', iotests.imgfmt, '-b', base_img_path, + top_img_path) + + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'write -P 2 1M 1M'))) + + log('') + log('--- Doing COR ---') + log('') + + # Compare with e.g. the following: + # vm.add_drive_raw('if=none,node-name=node0,copy-on-read=on,driver=raw,' \ + # 'file.driver=%s,file.file.filename=%s' % + # (iotests.imgfmt, top_img_path)) + # (Remove the blockdev-add instead.) + # ((Not tested here because it hits an assertion in the permission + # system.)) + + vm.launch() + + log(vm.qmp('blockdev-add', + node_name='node0', + driver='copy-on-read', + file={ + 'driver': 'raw', + 'file': { + 'driver': 'copy-on-read', + 'file': { + 'driver': 'raw', + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': top_img_path + }, + 'backing': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': base_img_path + } + } + } + } + } + })) + + # Trigger COR + log(vm.qmp('human-monitor-command', + command_line='qemu-io node0 "read 0 64M"')) + + vm.shutdown() + + log('') + log('--- Checking COR result ---') + log('') + + log(filter_qemu_io(qemu_io(base_img_path, '-c', 'discard 0 64M'))) + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 1 0M 1M'))) + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 2 1M 1M'))) diff --git a/tests/qemu-iotests/216.out b/tests/qemu-iotests/216.out new file mode 100644 index 0000000000..d3fc590d29 --- /dev/null +++ b/tests/qemu-iotests/216.out @@ -0,0 +1,28 @@ + +=== Copy-on-read across nodes === + +--- Setting up images --- + +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and 
XXX ops/sec) + +wrote 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + + +--- Doing COR --- + +{u'return': {}} +{u'return': u''} + +--- Checking COR result --- + +discard 67108864/67108864 bytes at offset 0 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +read 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index 85f66b852c..f285484951 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -52,11 +52,29 @@ _in_fd=4 # response is not echoed out. # If $mismatch_only is set, only non-matching responses will # be echoed. +# +# If $success_or_failure is set, the meaning of the arguments is +# changed as follows: +# $2: A string to search for in the response; if found, this indicates +# success and ${QEMU_STATUS[$1]} is set to 0. +# $3: A string to search for in the response; if found, this indicates +# failure and the test is either aborted (if $qemu_error_no_exit +# is not set) or ${QEMU_STATUS[$1]} is set to -1 (otherwise). function _timed_wait_for() { local h=${1} shift + if [ -z "${success_or_failure}" ]; then + success_match=${*} + failure_match= + else + success_match=${1} + failure_match=${2} + fi + + timeout=yes + QEMU_STATUS[$h]=0 while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} do @@ -64,10 +82,18 @@ function _timed_wait_for() echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp fi - grep -q "${*}" < <(echo "${resp}") + if [ -n "${failure_match}" ]; then + grep -q "${failure_match}" < <(echo "${resp}") + if [ $? -eq 0 ]; then + timeout= + break + fi + fi + grep -q "${success_match}" < <(echo "${resp}") if [ $? 
-eq 0 ]; then return - elif [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then + fi + if [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp fi @@ -75,8 +101,12 @@ function _timed_wait_for() done QEMU_STATUS[$h]=-1 if [ -z "${qemu_error_no_exit}" ]; then - echo "Timeout waiting for ${*} on handle ${h}" - exit 1 # Timeout means the test failed + if [ -n "${timeout}" ]; then + echo "Timeout waiting for ${success_match} on handle ${h}" + else + echo "Wrong response matching ${failure_match} on handle ${h}" + fi + exit 1 # Timeout or wrong match mean the test failed fi } @@ -96,6 +126,11 @@ function _timed_wait_for() # If $qemu_error_no_exit is set, then even if the expected response # is not seen, we will not exit. $QEMU_STATUS[$1] will be set it -1 in # that case. +# +# If $success_or_failure is set, then the last two strings are the +# strings the response will be scanned for. The first of the two +# indicates success, the latter indicates failure. Failure is handled +# like a timeout. 
function _send_qemu_cmd() { local h=${1} @@ -109,14 +144,23 @@ function _send_qemu_cmd() use_error="no" fi # This array element extraction is done to accommodate pathnames with spaces - cmd=${@: 1:${#@}-1} - shift $(($# - 1)) + if [ -z "${success_or_failure}" ]; then + cmd=${@: 1:${#@}-1} + shift $(($# - 1)) + else + cmd=${@: 1:${#@}-2} + shift $(($# - 2)) + fi while [ ${count} -gt 0 ] do echo "${cmd}" >&${QEMU_IN[${h}]} if [ -n "${1}" ]; then - qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" + if [ -z "${success_or_failure}" ]; then + qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" + else + qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" "${2}" + fi if [ ${QEMU_STATUS[$h]} -eq 0 ]; then return fi diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 5daef24020..cc8cd8cc8e 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -212,4 +212,7 @@ 211 rw auto quick 212 rw auto quick 213 rw auto quick +214 rw auto +215 rw auto quick +216 rw auto quick 218 rw auto quick