diff --git a/include/block/aio.h b/include/block/aio.h index f07ebb76b8..cb1989105a 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -227,6 +227,14 @@ struct AioContext { int64_t poll_grow; /* polling time growth factor */ int64_t poll_shrink; /* polling time shrink factor */ + /* + * List of handlers participating in userspace polling. Protected by + * ctx->list_lock. Iterated and modified mostly by the event loop thread + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() + * only touches the list to delete nodes if ctx->list_lock's count is zero. + */ + AioHandlerList poll_aio_handlers; + /* Are we in polling mode or monitoring file descriptors? */ bool poll_started; diff --git a/util/aio-posix.c b/util/aio-posix.c index 759989b45b..cd6cf0a4a9 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -22,6 +22,9 @@ #include "trace.h" #include "aio-posix.h" +/* Stop userspace polling on a handler if it isn't active for some time */ +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) + bool aio_poll_disabled(AioContext *ctx) { return atomic_read(&ctx->poll_disable_cnt); @@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) * deleted because deleted nodes are only cleaned up while * no one is walking the handlers list. */ + QLIST_SAFE_REMOVE(node, node_poll); QLIST_REMOVE(node, node); return true; } @@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started) ctx->poll_started = started; qemu_lockcnt_inc(&ctx->list_lock); - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { IOHandler *fn; if (QLIST_IS_INSERTED(node, node_deleted)) { @@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx) while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { QLIST_REMOVE(node, node); QLIST_REMOVE(node, node_deleted); + QLIST_SAFE_REMOVE(node, node_poll); g_free(node); } @@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) revents = node->pfd.revents & node->pfd.events; node->pfd.revents = 0; + /* + * Start polling AioHandlers when they become ready because activity is + * likely to continue. Note that starvation is theoretically possible when + * fdmon_supports_polling(), but only until the fd fires for the first + * time. + */ + if (!QLIST_IS_INSERTED(node, node_deleted) && + !QLIST_IS_INSERTED(node, node_poll) && + node->io_poll) { + trace_poll_add(ctx, node, node->pfd.fd, revents); + if (ctx->poll_started && node->io_poll_begin) { + node->io_poll_begin(node->opaque); + } + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); + } + if (!QLIST_IS_INSERTED(node, node_deleted) && (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && aio_node_check(ctx, node->is_external) && @@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx) timerlistgroup_run_timers(&ctx->tlg); } -static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) +static bool run_poll_handlers_once(AioContext *ctx, + int64_t now, + int64_t *timeout) { bool progress = false; AioHandler *node; + AioHandler *tmp; - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { - if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && - aio_node_check(ctx, node->is_external) && + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (aio_node_check(ctx, node->is_external) && node->io_poll(node->opaque)) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + /* * Polling was successful, exit try_poll_mode immediately * to adjust the next polling time. @@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) return progress; } +static bool fdmon_supports_polling(AioContext *ctx) +{ + return ctx->fdmon_ops->need_wait != aio_poll_disabled; +} + +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) +{ + AioHandler *node; + AioHandler *tmp; + bool progress = false; + + /* + * File descriptor monitoring implementations without userspace polling + * support suffer from starvation when a subset of handlers is polled + * because fds will not be processed in a timely fashion. Don't remove + * idle poll handlers. + */ + if (!fdmon_supports_polling(ctx)) { + return false; + } + + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (node->poll_idle_timeout == 0LL) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + } else if (now >= node->poll_idle_timeout) { + trace_poll_remove(ctx, node, node->pfd.fd); + node->poll_idle_timeout = 0LL; + QLIST_SAFE_REMOVE(node, node_poll); + if (ctx->poll_started && node->io_poll_end) { + node->io_poll_end(node->opaque); + + /* + * Final poll in case ->io_poll_end() races with an event. + * Nevermind about re-adding the handler in the rare case where + * this causes progress. + */ + progress = node->io_poll(node->opaque) || progress; + } + } + } + + return progress; +} + /* run_poll_handlers: * @ctx: the AioContext * @max_ns: maximum time to poll for, in nanoseconds @@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); do { - progress = run_poll_handlers_once(ctx, timeout); + progress = run_poll_handlers_once(ctx, start_time, timeout); elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; max_ns = qemu_soonest_timeout(*timeout, max_ns); assert(!(max_ns && progress)); } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { + *timeout = 0; + progress = true; + } + /* If time has passed with no successful polling, adjust *timeout to * keep the same ending time. */ @@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) */ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) { - int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); + int64_t max_ns; + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { + return false; + } + + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { poll_set_started(ctx, true); diff --git a/util/aio-posix.h b/util/aio-posix.h index 55fc771327..c80c04506a 100644 --- a/util/aio-posix.h +++ b/util/aio-posix.h @@ -30,10 +30,12 @@ struct AioHandler { QLIST_ENTRY(AioHandler) node; QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ QLIST_ENTRY(AioHandler) node_deleted; + QLIST_ENTRY(AioHandler) node_poll; #ifdef CONFIG_LINUX_IO_URING QSLIST_ENTRY(AioHandler) node_submitted; unsigned flags; /* see fdmon-io_uring.c */ #endif + int64_t poll_idle_timeout; /* when to stop userspace polling */ bool is_external; }; diff --git a/util/trace-events b/util/trace-events index 83b6639018..0ce42822eb 100644 --- a/util/trace-events +++ b/util/trace-events @@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" # async.c aio_co_schedule(void *ctx, void *co) "ctx %p co %p"