Improve performance by adding io_uring support, fix iothread compat

v5.2.0-11+vitastor3
Vitaliy Filippov 2023-07-19 02:36:33 +03:00
parent 9681496330
commit 1d91a9221e
3 changed files with 247 additions and 20 deletions

6
debian/changelog vendored
View File

@ -1,3 +1,9 @@
pve-qemu-kvm (5.2.0-11+vitastor3) buster; urgency=medium
* Improve performance by adding io_uring support, fix iothread compat
-- Vitaliy Filippov <vitalif@yourcmc.ru> Wed, 19 Jul 2023 02:35:47 +0300
pve-qemu-kvm (5.2.0-11+vitastor2) buster; urgency=medium
* Add bdrv_co_block_status implementation for QCOW2 export support

1
debian/control vendored
View File

@ -58,6 +58,7 @@ Depends: ceph-common (>= 0.48),
libspice-server1 (>= 0.14.0~),
libusb-1.0-0 (>= 1.0.17-1),
libusbredirparser1 (>= 0.6-2),
vitastor-client (>= 0.9.4),
libuuid1,
numactl,
${misc:Depends},

View File

@ -183,7 +183,7 @@ Index: a/block/vitastor.c
===================================================================
--- /dev/null
+++ a/block/vitastor.c
@@ -0,0 +1,797 @@
@@ -0,0 +1,1017 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
@ -195,6 +195,9 @@ Index: a/block/vitastor.c
+#endif
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#if QEMU_VERSION_MAJOR >= 8
+#include "block/block-io.h"
+#endif
+#include "block/block_int.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
@ -218,6 +221,11 @@ Index: a/block/vitastor.c
+#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
+#define qobject_unref QDECREF
+#endif
+#if QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 || QEMU_VERSION_MAJOR > 4
+#include "sysemu/replay.h"
+#else
+#include "sysemu/sysemu.h"
+#endif
+
+#include "vitastor_c.h"
+
@ -231,9 +239,13 @@ Index: a/block/vitastor.c
+}
+#endif
+
+typedef struct VitastorFdData VitastorFdData;
+
+typedef struct VitastorClient
+{
+ void *proxy;
+ int uring_eventfd;
+
+ void *watch;
+ char *config_path;
+ char *etcd_host;
@ -250,12 +262,24 @@ Index: a/block/vitastor.c
+ int rdma_gid_index;
+ int rdma_mtu;
+ QemuMutex mutex;
+ AioContext *ctx;
+ VitastorFdData **fds;
+ int fd_count, fd_alloc;
+ int bh_uring_scheduled;
+
+ uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
+ uint32_t last_bitmap_granularity;
+ uint8_t *last_bitmap;
+} VitastorClient;
+
+typedef struct VitastorFdData
+{
+ VitastorClient *cli;
+ int fd;
+ IOHandler *fd_read, *fd_write;
+ void *opaque;
+} VitastorFdData;
+
+typedef struct VitastorRPC
+{
+ BlockDriverState *bs;
@ -266,10 +290,21 @@ Index: a/block/vitastor.c
+ uint64_t inode, offset, len;
+ uint32_t bitmap_granularity;
+ uint8_t *bitmap;
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
+ QEMUBH *bh;
+#endif
+} VitastorRPC;
+
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
+typedef struct VitastorBH
+{
+ VitastorClient *cli;
+ QEMUBH *bh;
+} VitastorBH;
+#endif
+
+static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
+static void vitastor_co_generic_bh_cb(void *opaque, long retval);
+static void vitastor_co_generic_cb(void *opaque, long retval);
+static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version);
+static void vitastor_close(BlockDriverState *bs);
+
@ -385,6 +420,57 @@ Index: a/block/vitastor.c
+ return;
+}
+
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+static void vitastor_uring_handler(void *opaque)
+{
+ VitastorClient *client = (VitastorClient*)opaque;
+ qemu_mutex_lock(&client->mutex);
+ client->bh_uring_scheduled = 0;
+ do
+ {
+ vitastor_c_uring_handle_events(client->proxy);
+ } while (vitastor_c_uring_has_work(client->proxy));
+ qemu_mutex_unlock(&client->mutex);
+}
+
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
+static void vitastor_bh_uring_handler(void *opaque)
+{
+ VitastorBH *vbh = opaque;
+ vitastor_bh_handler(vbh->cli);
+ qemu_bh_delete(vbh->bh);
+ free(vbh);
+}
+#endif
+
+static void vitastor_schedule_uring_handler(VitastorClient *client)
+{
+ void *opaque = client;
+ if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled)
+ {
+ client->bh_uring_scheduled = 1;
+#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
+ replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque);
+#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
+ aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque);
+#else
+ VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH));
+ vbh->cli = client;
+#if QEMU_VERSION_MAJOR >= 2
+ vbh->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_bh_uring_handler, vbh);
+#else
+ vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh);
+#endif
+ qemu_bh_schedule(vbh->bh);
+#endif
+ }
+}
+#else
+static void vitastor_schedule_uring_handler(VitastorClient *client)
+{
+}
+#endif
+
+static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
+{
+ BlockDriverState *bs = task->bs;
@ -392,7 +478,8 @@ Index: a/block/vitastor.c
+ task->co = qemu_coroutine_self();
+
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_bh_cb, task);
+ vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task->complete)
@ -401,13 +488,32 @@ Index: a/block/vitastor.c
+ }
+}
+
+static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
+static void vitastor_aio_fd_read(void *fddv)
+{
+ VitastorFdData *fdd = (VitastorFdData*)fddv;
+ qemu_mutex_lock(&fdd->cli->mutex);
+ fdd->fd_read(fdd->opaque);
+ vitastor_schedule_uring_handler(fdd->cli);
+ qemu_mutex_unlock(&fdd->cli->mutex);
+}
+
+static void vitastor_aio_fd_write(void *fddv)
+{
+ VitastorFdData *fdd = (VitastorFdData*)fddv;
+ qemu_mutex_lock(&fdd->cli->mutex);
+ fdd->fd_write(fdd->opaque);
+ vitastor_schedule_uring_handler(fdd->cli);
+ qemu_mutex_unlock(&fdd->cli->mutex);
+}
+
+static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque)
+{
+ aio_set_fd_handler(ctx, fd,
+#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
+ 0 /*is_external*/,
+#endif
+ fd_read, fd_write,
+ fd_read,
+ fd_write,
+#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
+ NULL /*io_flush*/,
+#endif
@ -420,6 +526,53 @@ Index: a/block/vitastor.c
+ opaque);
+}
+
+static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
+{
+ VitastorClient *client = (VitastorClient*)vcli;
+ VitastorFdData *fdd = NULL;
+ int i;
+ for (i = 0; i < client->fd_count; i++)
+ {
+ if (client->fds[i]->fd == fd)
+ {
+ if (fd_read || fd_write)
+ {
+ fdd = client->fds[i];
+ fdd->opaque = opaque;
+ fdd->fd_read = fd_read;
+ fdd->fd_write = fd_write;
+ }
+ else
+ {
+ for (int j = i+1; j < client->fd_count; j++)
+ client->fds[j-1] = client->fds[j];
+ client->fd_count--;
+ }
+ break;
+ }
+ }
+ if ((fd_read || fd_write) && !fdd)
+ {
+ fdd = (VitastorFdData*)malloc(sizeof(VitastorFdData));
+ fdd->cli = client;
+ fdd->fd = fd;
+ fdd->fd_read = fd_read;
+ fdd->fd_write = fd_write;
+ fdd->opaque = opaque;
+ if (client->fd_count >= client->fd_alloc)
+ {
+ client->fd_alloc = client->fd_alloc*2;
+ if (client->fd_alloc < 16)
+ client->fd_alloc = 16;
+ client->fds = (VitastorFdData**)realloc(client->fds, sizeof(VitastorFdData*) * client->fd_alloc);
+ }
+ client->fds[client->fd_count++] = fdd;
+ }
+ universal_aio_set_fd_handler(
+ client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd
+ );
+}
+
+static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
+{
+ VitastorRPC task;
@ -437,10 +590,36 @@ Index: a/block/vitastor.c
+ client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
+ client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
+ client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
+ client->proxy = vitastor_c_create_qemu(
+ vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
+ client->ctx = bdrv_get_aio_context(bs);
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+ client->proxy = vitastor_c_create_qemu_uring(
+ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
+ client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
+ );
+ if (!client->proxy)
+ {
+ fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
+ client->uring_eventfd = -1;
+#endif
+ client->proxy = vitastor_c_create_qemu(
+ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
+ client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
+ );
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
+ }
+ else
+ {
+ client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
+ if (client->uring_eventfd < 0)
+ {
+ fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
+ error_setg(errp, "failed to create io_uring eventfd");
+ vitastor_close(bs);
+ return -1;
+ }
+ universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
+ }
+#endif
+ image = client->image = g_strdup(qdict_get_try_str(options, "image"));
+ client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
+ // Get image metadata (size and readonly flag) or just wait until the client is ready
@ -454,7 +633,13 @@ Index: a/block/vitastor.c
+ }
+ else
+ {
+#if QEMU_VERSION_MAJOR >= 8
+ aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
+#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
+ bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
+#else
+ qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
+#endif
+ BDRV_POLL_WHILE(bs, !task.complete);
+ }
+ client->image = image;
@ -514,6 +699,12 @@ Index: a/block/vitastor.c
+{
+ VitastorClient *client = bs->opaque;
+ vitastor_c_destroy(client->proxy);
+ if (client->fds)
+ {
+ free(client->fds);
+ client->fds = NULL;
+ client->fd_alloc = client->fd_count = 0;
+ }
+ qemu_mutex_destroy(&client->mutex);
+ if (client->config_path)
+ g_free(client->config_path);
@ -630,25 +821,44 @@ Index: a/block/vitastor.c
+ };
+}
+
+static void vitastor_co_generic_bh_cb(void *opaque, long retval)
+static void vitastor_co_generic_bh_cb(void *opaque)
+{
+ VitastorRPC *task = opaque;
+ task->ret = retval;
+ task->complete = 1;
+ if (qemu_coroutine_self() != task->co)
+ {
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
+ aio_co_wake(task->co);
+#else
+#if QEMU_VERSION_MAJOR == 2
+ qemu_bh_delete(task->bh);
+#endif
+ qemu_coroutine_enter(task->co, NULL);
+ qemu_aio_release(task);
+#endif
+ }
+}
+
+static void vitastor_co_generic_cb(void *opaque, long retval)
+{
+ VitastorRPC *task = opaque;
+ task->ret = retval;
+#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
+ replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
+ aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 2
+ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#else
+ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#endif
+}
+
+static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version)
+{
+ vitastor_co_generic_bh_cb(opaque, retval);
+ vitastor_co_generic_cb(opaque, retval);
+}
+
+static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
@ -667,6 +877,7 @@ Index: a/block/vitastor.c
+ uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)
@ -699,7 +910,8 @@ Index: a/block/vitastor.c
+
+ uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
+ vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)
@ -717,7 +929,6 @@ Index: a/block/vitastor.c
+ VitastorRPC *task = opaque;
+ VitastorClient *client = task->bs->opaque;
+ task->ret = retval;
+ task->complete = 1;
+ if (retval >= 0)
+ {
+ task->bitmap = bitmap;
@ -729,15 +940,17 @@ Index: a/block/vitastor.c
+ client->last_bitmap = bitmap;
+ }
+ }
+ if (qemu_coroutine_self() != task->co)
+ {
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
+ aio_co_wake(task->co);
+#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
+ replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
+ aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+#elif QEMU_VERSION_MAJOR >= 2
+ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#else
+ qemu_coroutine_enter(task->co, NULL);
+ qemu_aio_release(task);
+ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
+ qemu_bh_schedule(task->bh);
+#endif
+ }
+}
+
+static int coroutine_fn vitastor_co_block_status(
@ -778,6 +991,7 @@ Index: a/block/vitastor.c
+ task.bitmap = client->last_bitmap = NULL;
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+ while (!task.complete)
+ {
@ -863,7 +1077,8 @@ Index: a/block/vitastor.c
+ vitastor_co_init_task(bs, &task);
+
+ qemu_mutex_lock(&client->mutex);
+ vitastor_c_sync(client->proxy, vitastor_co_generic_bh_cb, &task);
+ vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
+ vitastor_schedule_uring_handler(client);
+ qemu_mutex_unlock(&client->mutex);
+
+ while (!task.complete)
@ -918,8 +1133,13 @@ Index: a/block/vitastor.c
+ .bdrv_parse_filename = vitastor_parse_filename,
+
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+#if QEMU_VERSION_MAJOR >= 8
+ .bdrv_co_get_info = vitastor_get_info,
+ .bdrv_co_getlength = vitastor_getlength,
+#else
+ .bdrv_get_info = vitastor_get_info,
+ .bdrv_getlength = vitastor_getlength,
+#endif
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
+ .bdrv_probe_blocksizes = vitastor_probe_blocksizes,
+#endif