From d79df2a2ceb3cb0771146587e9a4bfb312577f46 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Mar 2017 18:48:10 +0100 Subject: [PATCH 1/4] blockjob: avoid recursive AioContext locking Streaming or any other block job hangs when performed on a block device that has a non-default iothread. This happens because the AioContext is acquired twice by block_job_defer_to_main_loop_bh and then released only once by BDRV_POLL_WHILE. (Insert rants on recursive mutexes, which unfortunately are a temporary but necessary evil for iothreads at the moment). Luckily, the reason for the double acquisition is simple; the function acquires the AioContext for both the job iothread and the BDS iothread, in case the BDS iothread was changed while the job was running. It is therefore enough to skip the second acquisition when the two AioContexts are one and the same. Signed-off-by: Paolo Bonzini Reviewed-by: Eric Blake Reviewed-by: Jeff Cody Message-id: 1490118490-5597-1-git-send-email-pbonzini@redhat.com Signed-off-by: Jeff Cody --- blockjob.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/blockjob.c b/blockjob.c index 69126af97f..2159df776b 100644 --- a/blockjob.c +++ b/blockjob.c @@ -755,12 +755,16 @@ static void block_job_defer_to_main_loop_bh(void *opaque) /* Fetch BDS AioContext again, in case it has changed */ aio_context = blk_get_aio_context(data->job->blk); - aio_context_acquire(aio_context); + if (aio_context != data->aio_context) { + aio_context_acquire(aio_context); + } data->job->deferred_to_main_loop = false; data->fn(data->job, data->opaque); - aio_context_release(aio_context); + if (aio_context != data->aio_context) { + aio_context_release(aio_context); + } aio_context_release(data->aio_context); From e3796a245ad0efa65ca8d2dc6424562a8fbaeb6a Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 16 Mar 2017 17:23:49 -0400 Subject: [PATCH 2/4] blockjob: add block_job_start_shim The purpose of this shim is to allow us to pause pre-started 
jobs. The purpose of *that* is to allow us to buffer a pause request that will be able to take effect before the job ever does any work, allowing us to create jobs during a quiescent state (under which they will be automatically paused) and then resume the jobs after the critical section in any order, either: (1) -block_job_start -block_job_resume (via e.g. drained_end) (2) -block_job_resume (via e.g. drained_end) -block_job_start A startup wrapper is needed because a job must be in the busy=true state only the first time it is entered; all subsequent entries require busy to be false, and the toggling of this state is otherwise handled during existing pause and yield points. The wrapper simply allows us to mandate that a job can "start," set busy to true, then immediately pause only if necessary. We could avoid requiring a wrapper, but then every job would need to perform this initial pause point itself, so it has been factored out here. Signed-off-by: John Snow Reviewed-by: Jeff Cody Message-id: 20170316212351.13797-2-jsnow@redhat.com Signed-off-by: Jeff Cody --- blockjob.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/blockjob.c b/blockjob.c index 2159df776b..0e9ed0336d 100644 --- a/blockjob.c +++ b/blockjob.c @@ -250,16 +250,28 @@ static bool block_job_started(BlockJob *job) return job->co; } +/** + * All jobs must allow a pause point before entering their job proper. This + * ensures that jobs can be paused prior to being started, then resumed later. 
+ */ +static void coroutine_fn block_job_co_entry(void *opaque) +{ + BlockJob *job = opaque; + + assert(job && job->driver && job->driver->start); + block_job_pause_point(job); + job->driver->start(job); +} + void block_job_start(BlockJob *job) { assert(job && !block_job_started(job) && job->paused && - !job->busy && job->driver->start); - job->co = qemu_coroutine_create(job->driver->start, job); - if (--job->pause_count == 0) { - job->paused = false; - job->busy = true; - qemu_coroutine_enter(job->co); - } + job->driver && job->driver->start); + job->co = qemu_coroutine_create(block_job_co_entry, job); + job->pause_count--; + job->busy = true; + job->paused = false; + qemu_coroutine_enter(job->co); } void block_job_ref(BlockJob *job) From f4d9cc88ee69a5b04a843424e50f466e36fcad4e Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 16 Mar 2017 17:23:50 -0400 Subject: [PATCH 3/4] block-backend: add drained_begin / drained_end ops Allow block backends to forward drain requests to their devices/users. The initial intended purpose for this patch is to allow BBs to forward requests along to BlockJobs, which will want to pause if their associated BB has entered a drained region. 
Signed-off-by: John Snow Reviewed-by: Jeff Cody Message-id: 20170316212351.13797-3-jsnow@redhat.com Signed-off-by: Jeff Cody --- block/block-backend.c | 24 ++++++++++++++++++++++-- include/sysemu/block-backend.h | 8 ++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 5742c09c2c..0b6377332c 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -65,6 +65,8 @@ struct BlockBackend { bool allow_write_beyond_eof; NotifierList remove_bs_notifiers, insert_bs_notifiers; + + int quiesce_counter; }; typedef struct BlockBackendAIOCB { @@ -699,12 +701,17 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque) { /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep - * it that way, so we can assume blk->dev is a DeviceState if blk->dev_ops - * is set. */ + * it that way, so we can assume blk->dev, if present, is a DeviceState if + * blk->dev_ops is set. Non-device users may use dev_ops without device. */ assert(!blk->legacy_dev); blk->dev_ops = ops; blk->dev_opaque = opaque; + + /* Are we currently quiesced? Should we enforce this right now? */ + if (blk->quiesce_counter && ops->drained_begin) { + ops->drained_begin(opaque); + } } /* @@ -1870,6 +1877,12 @@ static void blk_root_drained_begin(BdrvChild *child) { BlockBackend *blk = child->opaque; + if (++blk->quiesce_counter == 1) { + if (blk->dev_ops && blk->dev_ops->drained_begin) { + blk->dev_ops->drained_begin(blk->dev_opaque); + } + } + /* Note that blk->root may not be accessible here yet if we are just * attaching to a BlockDriverState that is drained. Use child instead. 
*/ @@ -1881,7 +1894,14 @@ static void blk_root_drained_begin(BdrvChild *child) static void blk_root_drained_end(BdrvChild *child) { BlockBackend *blk = child->opaque; + assert(blk->quiesce_counter); assert(blk->public.io_limits_disabled); --blk->public.io_limits_disabled; + + if (--blk->quiesce_counter == 0) { + if (blk->dev_ops && blk->dev_ops->drained_end) { + blk->dev_ops->drained_end(blk->dev_opaque); + } + } } diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 096c17fce0..7462228ac1 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -58,6 +58,14 @@ typedef struct BlockDevOps { * Runs when the size changed (e.g. monitor command block_resize) */ void (*resize_cb)(void *opaque); + /* + * Runs when the backend receives a drain request. + */ + void (*drained_begin)(void *opaque); + /* + * Runs when the backend's last drain request ends. + */ + void (*drained_end)(void *opaque); } BlockDevOps; /* This struct is embedded in (the private) BlockBackend struct and contains From 600ac6a0ef5c06418446ef2f37407bddcc51b21c Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 16 Mar 2017 17:23:51 -0400 Subject: [PATCH 4/4] blockjob: add devops to blockjob backends This lets us hook into drained_begin and drained_end requests from the backend level, which is particularly useful for making sure that all jobs associated with a particular node (whether the source or the target) receive a drain request. 
Suggested-by: Kevin Wolf Signed-off-by: John Snow Reviewed-by: Jeff Cody Message-id: 20170316212351.13797-4-jsnow@redhat.com Signed-off-by: Jeff Cody --- blockjob.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/blockjob.c b/blockjob.c index 0e9ed0336d..9b619f385a 100644 --- a/blockjob.c +++ b/blockjob.c @@ -68,6 +68,23 @@ static const BdrvChildRole child_job = { .stay_at_node = true, }; +static void block_job_drained_begin(void *opaque) +{ + BlockJob *job = opaque; + block_job_pause(job); +} + +static void block_job_drained_end(void *opaque) +{ + BlockJob *job = opaque; + block_job_resume(job); +} + +static const BlockDevOps block_job_dev_ops = { + .drained_begin = block_job_drained_begin, + .drained_end = block_job_drained_end, +}; + BlockJob *block_job_next(BlockJob *job) { if (!job) { @@ -205,11 +222,6 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, } job = g_malloc0(driver->instance_size); - error_setg(&job->blocker, "block device is in use by block job: %s", - BlockJobType_lookup[driver->job_type]); - block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort); - bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); - job->driver = driver; job->id = g_strdup(job_id); job->blk = blk; @@ -219,8 +231,15 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, job->paused = true; job->pause_count = 1; job->refcnt = 1; + + error_setg(&job->blocker, "block device is in use by block job: %s", + BlockJobType_lookup[driver->job_type]); + block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort); bs->job = job; + blk_set_dev_ops(blk, &block_job_dev_ops, job); + bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); + QLIST_INSERT_HEAD(&block_jobs, job, job_list); blk_add_aio_context_notifier(blk, block_job_attached_aio_context,