Improve performance by adding io_uring support, fix qemu-img deadlocks

v8.0.2-3+vitastor2
Vitaliy Filippov 2023-07-19 02:07:35 +03:00
parent 40f9a6f1c2
commit a417b83c07
3 changed files with 138 additions and 27 deletions

7
debian/changelog vendored
View File

@ -1,3 +1,10 @@
pve-qemu-kvm (8.0.2-3+vitastor2) bookworm; urgency=medium
* Improve performance by adding io_uring support
* Fix qemu-img deadlocks after iothread fixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Wed, 19 Jul 2023 02:07:02 +0300
pve-qemu-kvm (8.0.2-3+vitastor1) bookworm; urgency=medium
* Add Vitastor support

1
debian/control vendored
View File

@ -58,6 +58,7 @@ Depends: ceph-common (>= 0.48),
libspice-server1 (>= 0.14.0~),
libusb-1.0-0 (>= 1.0.17-1),
libusbredirparser1 (>= 0.6-2),
vitastor-client (>= 0.9.4),
libuuid1,
${misc:Depends},
${shlibs:Depends},

View File

@ -192,7 +192,7 @@ Index: a/block/vitastor.c
===================================================================
--- /dev/null
+++ a/block/vitastor.c
@@ -0,0 +1,914 @@
@@ -0,0 +1,1017 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
@ -253,6 +253,8 @@ Index: a/block/vitastor.c
+typedef struct VitastorClient
+{
+ void *proxy;
+ int uring_eventfd;
+
+ void *watch;
+ char *config_path;
+ char *etcd_host;
@ -272,6 +274,7 @@ Index: a/block/vitastor.c
+ AioContext *ctx;
+ VitastorFdData **fds;
+ int fd_count, fd_alloc;
+ int bh_uring_scheduled;
+
+ uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
+ uint32_t last_bitmap_granularity;
@ -301,6 +304,14 @@ Index: a/block/vitastor.c
+#endif
+} VitastorRPC;
+
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
+typedef struct VitastorBH // Context handed to vitastor_bh_uring_handler() as its opaque pointer (QEMU 2.x before 2.8 only)
+{
+ VitastorClient *cli; // client the bottom-half callback operates on
+ QEMUBH *bh; // the bottom half itself, kept so the callback can qemu_bh_delete() it
+} VitastorBH;
+#endif
+
+static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
+static void vitastor_co_generic_cb(void *opaque, long retval);
+static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version);
@ -418,6 +429,57 @@ Index: a/block/vitastor.c
+ return;
+}
+
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+// Processes io_uring events for one client. Used both as a scheduled bottom-half
+// callback and as the read handler of the io_uring eventfd.
+// opaque: the VitastorClient the callback was registered with.
+static void vitastor_uring_handler(void *opaque)
+{
+    VitastorClient *cli = opaque;
+
+    qemu_mutex_lock(&cli->mutex);
+    // Reset under the mutex; vitastor_schedule_uring_handler() tests this flag
+    // to avoid scheduling the handler twice.
+    cli->bh_uring_scheduled = 0;
+    // Always handle events at least once, then repeat while the library reports more work
+    for (;;)
+    {
+        vitastor_c_uring_handle_events(cli->proxy);
+        if (!vitastor_c_uring_has_work(cli->proxy))
+        {
+            break;
+        }
+    }
+    qemu_mutex_unlock(&cli->mutex);
+}
+
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
+// Bottom-half callback for QEMU 2.x before 2.8.
+// opaque: the heap-allocated VitastorBH created when the bottom half was scheduled.
+// Runs the io_uring handler for the stored client, then deletes the bottom half
+// and frees the VitastorBH, so each scheduled instance runs exactly once.
+static void vitastor_bh_uring_handler(void *opaque)
+{
+    VitastorBH *vbh = opaque;
+    // Run the same handler that the newer-QEMU branches schedule directly
+    vitastor_uring_handler(vbh->cli);
+    qemu_bh_delete(vbh->bh);
+    free(vbh);
+}
+#endif
+
+// Schedules vitastor_uring_handler() to run once in the client's AioContext.
+// Does nothing when io_uring is not in use (uring_eventfd < 0) or when a run is
+// already pending (bh_uring_scheduled is set).
+// Every caller visible in this file invokes it while holding client->mutex.
+static void vitastor_schedule_uring_handler(VitastorClient *client)
+{
+    if (client->uring_eventfd < 0 || client->bh_uring_scheduled)
+    {
+        return;
+    }
+    client->bh_uring_scheduled = 1;
+#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
+    replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, client);
+#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
+    aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, client);
+#else
+    // No one-shot helper is used here: allocate a context that
+    // vitastor_bh_uring_handler() releases after it has run.
+    VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH));
+    if (!vbh)
+    {
+        // Nothing was scheduled; clear the flag so a later call can retry
+        client->bh_uring_scheduled = 0;
+        return;
+    }
+    vbh->cli = client;
+#if QEMU_VERSION_MAJOR >= 2
+    // Use the client's own AioContext, like the branches above
+    vbh->bh = aio_bh_new(client->ctx, vitastor_bh_uring_handler, vbh);
+#else
+    vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh);
+#endif
+    qemu_bh_schedule(vbh->bh);
+#endif
+}
+#else
+static void vitastor_schedule_uring_handler(VitastorClient *client)
+{
+}
+#endif
+
+static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
+{
+ BlockDriverState *bs = task->bs;
@ -426,6 +488,7 @@ Index: a/block/vitastor.c
+
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task->complete)
@ -439,6 +502,7 @@ Index: a/block/vitastor.c
+ VitastorFdData *fdd = (VitastorFdData*)fddv;
+ qemu_mutex_lock(&fdd->cli->mutex);
+ fdd->fd_read(fdd->opaque);
+ vitastor_schedule_uring_handler(fdd->cli);
+ qemu_mutex_unlock(&fdd->cli->mutex);
+}
+
@ -447,9 +511,30 @@ Index: a/block/vitastor.c
+ VitastorFdData *fdd = (VitastorFdData*)fddv;
+ qemu_mutex_lock(&fdd->cli->mutex);
+ fdd->fd_write(fdd->opaque);
+ vitastor_schedule_uring_handler(fdd->cli);
+ qemu_mutex_unlock(&fdd->cli->mutex);
+}
+
+// Version-independent wrapper around aio_set_fd_handler().
+// Registers fd_read / fd_write for fd in ctx with the given opaque pointer and
+// fills in the additional arguments that particular QEMU releases expect:
+//   is_external   - passed as 0 on QEMU 2.5 up to and including 8.0 (removed in 8.1)
+//   io_flush      - passed as NULL on QEMU 1.6 and older
+//   io_poll       - passed as NULL on QEMU 2.9 and newer
+//   io_poll_ready - passed as NULL on QEMU 7.0 and newer
+static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque)
+{
+    aio_set_fd_handler(ctx, fd,
+#if (QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5) || (QEMU_VERSION_MAJOR >= 3 && (QEMU_VERSION_MAJOR < 8 || (QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1)))
+        0 /*is_external*/,
+#endif
+        fd_read,
+        fd_write,
+#if (QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6) || QEMU_VERSION_MAJOR < 1
+        NULL /*io_flush*/,
+#endif
+#if (QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9) || QEMU_VERSION_MAJOR >= 3
+        NULL /*io_poll*/,
+#endif
+#if QEMU_VERSION_MAJOR >= 7
+        NULL /*io_poll_ready*/,
+#endif
+        opaque);
+}
+
+static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
+{
+ VitastorClient *client = (VitastorClient*)vcli;
@ -492,22 +577,9 @@ Index: a/block/vitastor.c
+ }
+ client->fds[client->fd_count++] = fdd;
+ }
+ aio_set_fd_handler(client->ctx, fd,
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
+ 0 /*is_external*/,
+#endif
+ fd_read ? vitastor_aio_fd_read : NULL,
+ fd_write ? vitastor_aio_fd_write : NULL,
+#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
+ NULL /*io_flush*/,
+#endif
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
+ NULL /*io_poll*/,
+#endif
+#if QEMU_VERSION_MAJOR >= 7
+ NULL /*io_poll_ready*/,
+#endif
+ fdd);
+ universal_aio_set_fd_handler(
+ client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd
+ );
+}
+
+static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
@ -528,10 +600,35 @@ Index: a/block/vitastor.c
+ client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
+ client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
+ client->ctx = bdrv_get_aio_context(bs);
+ client->proxy = vitastor_c_create_qemu(
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+ client->proxy = vitastor_c_create_qemu_uring(
+ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
+ client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
+ );
+ if (!client->proxy)
+ {
+ fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
+ client->uring_eventfd = -1;
+#endif
+ client->proxy = vitastor_c_create_qemu(
+ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
+ client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
+ );
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+ }
+ else
+ {
+ client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
+ if (client->uring_eventfd < 0)
+ {
+ fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
+ error_setg(errp, "failed to create io_uring eventfd");
+ vitastor_close(bs);
+ return -1;
+ }
+ universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
+ }
+#endif
+ image = client->image = g_strdup(qdict_get_try_str(options, "image"));
+ client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
+ // Get image metadata (size and readonly flag) or just wait until the client is ready
@ -763,7 +860,8 @@ Index: a/block/vitastor.c
+ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#else
+ vitastor_co_generic_bh_cb(opaque);
+ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#endif
+}
+
@ -788,6 +886,7 @@ Index: a/block/vitastor.c
+ uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)
@ -821,6 +920,7 @@ Index: a/block/vitastor.c
+ uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)
@ -838,7 +938,6 @@ Index: a/block/vitastor.c
+ VitastorRPC *task = opaque;
+ VitastorClient *client = task->bs->opaque;
+ task->ret = retval;
+ task->complete = 1;
+ if (retval >= 0)
+ {
+ task->bitmap = bitmap;
@ -850,15 +949,17 @@ Index: a/block/vitastor.c
+ client->last_bitmap = bitmap;
+ }
+ }
+ if (qemu_coroutine_self() != task->co)
+ {
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
+ aio_co_wake(task->co);
+#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
+ replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
+ aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 2
+ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#else
+ qemu_coroutine_enter(task->co, NULL);
+ qemu_aio_release(task);
+ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#endif
+ }
+}
+
+static int coroutine_fn vitastor_co_block_status(
@ -899,6 +1000,7 @@ Index: a/block/vitastor.c
+ task.bitmap = client->last_bitmap = NULL;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+ while (!task.complete)
+ {
@ -985,6 +1087,7 @@ Index: a/block/vitastor.c
+
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)