diff --git a/debian/changelog b/debian/changelog index c8b8e5e..e38a77b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +pve-qemu-kvm (7.1.0-4+vitastor4) bullseye; urgency=medium + + * Improve performance by adding io_uring support + * Fix compatibility with iothread + + -- Vitaliy Filippov Tue, 18 Jul 2023 02:22:28 +0300 + pve-qemu-kvm (7.1.0-4+vitastor3) bullseye; urgency=medium * Add bdrv_co_block_status implementation for QCOW2 export support diff --git a/debian/control b/debian/control index 45e9ab9..2fd2435 100644 --- a/debian/control +++ b/debian/control @@ -62,6 +62,7 @@ Depends: ceph-common (>= 0.48), libspice-server1 (>= 0.14.0~), libusb-1.0-0 (>= 1.0.17-1), libusbredirparser1 (>= 0.6-2), + vitastor-client (>= 0.9.4), libuuid1, ${misc:Depends}, ${shlibs:Depends}, diff --git a/debian/patches/pve-qemu-7.1-vitastor.patch b/debian/patches/pve-qemu-7.1-vitastor.patch index 661eb5e..21eaa92 100644 --- a/debian/patches/pve-qemu-7.1-vitastor.patch +++ b/debian/patches/pve-qemu-7.1-vitastor.patch @@ -171,7 +171,7 @@ Index: a/block/vitastor.c =================================================================== --- /dev/null +++ a/block/vitastor.c -@@ -0,0 +1,797 @@ +@@ -0,0 +1,1017 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details) + @@ -183,6 +183,9 @@ Index: a/block/vitastor.c +#endif +#include "qemu/osdep.h" +#include "qemu/main-loop.h" ++#if QEMU_VERSION_MAJOR >= 8 ++#include "block/block-io.h" ++#endif +#include "block/block_int.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" @@ -206,6 +209,11 @@ Index: a/block/vitastor.c +#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value))) +#define qobject_unref QDECREF +#endif ++#if QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 || QEMU_VERSION_MAJOR > 4 ++#include "sysemu/replay.h" ++#else ++#include "sysemu/sysemu.h" ++#endif + +#include "vitastor_c.h" + @@ -219,9 +227,13 @@ 
Index: a/block/vitastor.c +} +#endif + ++typedef struct VitastorFdData VitastorFdData; ++ +typedef struct VitastorClient +{ + void *proxy; ++ int uring_eventfd; ++ + void *watch; + char *config_path; + char *etcd_host; @@ -238,12 +250,24 @@ Index: a/block/vitastor.c + int rdma_gid_index; + int rdma_mtu; + QemuMutex mutex; ++ AioContext *ctx; ++ VitastorFdData **fds; ++ int fd_count, fd_alloc; ++ int bh_uring_scheduled; + + uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len; + uint32_t last_bitmap_granularity; + uint8_t *last_bitmap; +} VitastorClient; + ++typedef struct VitastorFdData ++{ ++ VitastorClient *cli; ++ int fd; ++ IOHandler *fd_read, *fd_write; ++ void *opaque; ++} VitastorFdData; ++ +typedef struct VitastorRPC +{ + BlockDriverState *bs; @@ -254,10 +278,21 @@ Index: a/block/vitastor.c + uint64_t inode, offset, len; + uint32_t bitmap_granularity; + uint8_t *bitmap; ++#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8 ++ QEMUBH *bh; ++#endif +} VitastorRPC; + ++#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8 ++typedef struct VitastorBH ++{ ++ VitastorClient *cli; ++ QEMUBH *bh; ++} VitastorBH; ++#endif ++ +static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task); -+static void vitastor_co_generic_bh_cb(void *opaque, long retval); ++static void vitastor_co_generic_cb(void *opaque, long retval); +static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version); +static void vitastor_close(BlockDriverState *bs); + @@ -373,6 +408,57 @@ Index: a/block/vitastor.c + return; +} + ++#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 ++static void vitastor_uring_handler(void *opaque) ++{ ++ VitastorClient *client = (VitastorClient*)opaque; ++ qemu_mutex_lock(&client->mutex); ++ client->bh_uring_scheduled = 0; ++ do ++ { ++ vitastor_c_uring_handle_events(client->proxy); ++ } while (vitastor_c_uring_has_work(client->proxy)); ++ qemu_mutex_unlock(&client->mutex); ++} ++ ++#if QEMU_VERSION_MAJOR == 2 && 
QEMU_VERSION_MINOR < 8 ++static void vitastor_bh_uring_handler(void *opaque) ++{ ++ VitastorBH *vbh = opaque; ++ vitastor_uring_handler(vbh->cli); /* run the same io_uring handler that the oneshot-BH branches schedule */ ++ qemu_bh_delete(vbh->bh); ++ free(vbh); ++} ++#endif ++ ++static void vitastor_schedule_uring_handler(VitastorClient *client) ++{ ++ void *opaque = client; ++ if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled) ++ { ++ client->bh_uring_scheduled = 1; ++#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 ++ replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque); ++#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 ++ aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque); ++#else ++ VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH)); ++ vbh->cli = client; ++#if QEMU_VERSION_MAJOR >= 2 ++ vbh->bh = aio_bh_new(client->ctx, vitastor_bh_uring_handler, vbh); /* no task is in scope here; use the client's AioContext like the branches above */ ++#else ++ vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh); ++#endif ++ qemu_bh_schedule(vbh->bh); ++#endif ++ } ++} ++#else ++static void vitastor_schedule_uring_handler(VitastorClient *client) ++{ ++} ++#endif ++ ++static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task) +{ + BlockDriverState *bs = task->bs; @@ -380,7 +466,8 @@ Index: a/block/vitastor.c + task->co = qemu_coroutine_self(); + + qemu_mutex_lock(&client->mutex); -+ vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_bh_cb, task); ++ vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task); ++ vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task->complete) @@ -389,13 +476,32 @@ Index: a/block/vitastor.c + } +} + -+static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque) ++static void vitastor_aio_fd_read(void *fddv) ++{ ++ VitastorFdData *fdd = (VitastorFdData*)fddv; ++ 
qemu_mutex_lock(&fdd->cli->mutex); ++ fdd->fd_read(fdd->opaque); ++ vitastor_schedule_uring_handler(fdd->cli); ++ qemu_mutex_unlock(&fdd->cli->mutex); ++} ++ ++static void vitastor_aio_fd_write(void *fddv) ++{ ++ VitastorFdData *fdd = (VitastorFdData*)fddv; ++ qemu_mutex_lock(&fdd->cli->mutex); ++ fdd->fd_write(fdd->opaque); ++ vitastor_schedule_uring_handler(fdd->cli); ++ qemu_mutex_unlock(&fdd->cli->mutex); ++} ++ ++static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque) +{ + aio_set_fd_handler(ctx, fd, +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3 + 0 /*is_external*/, +#endif -+ fd_read, fd_write, ++ fd_read, ++ fd_write, +#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1 + NULL /*io_flush*/, +#endif @@ -408,6 +514,53 @@ Index: a/block/vitastor.c + opaque); +} + ++static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque) ++{ ++ VitastorClient *client = (VitastorClient*)vcli; ++ VitastorFdData *fdd = NULL; ++ int i; ++ for (i = 0; i < client->fd_count; i++) ++ { ++ if (client->fds[i]->fd == fd) ++ { ++ if (fd_read || fd_write) ++ { ++ fdd = client->fds[i]; ++ fdd->opaque = opaque; ++ fdd->fd_read = fd_read; ++ fdd->fd_write = fd_write; ++ } ++ else ++ { ++ for (int j = i+1; j < client->fd_count; j++) ++ client->fds[j-1] = client->fds[j]; ++ client->fd_count--; ++ } ++ break; ++ } ++ } ++ if ((fd_read || fd_write) && !fdd) ++ { ++ fdd = (VitastorFdData*)malloc(sizeof(VitastorFdData)); ++ fdd->cli = client; ++ fdd->fd = fd; ++ fdd->fd_read = fd_read; ++ fdd->fd_write = fd_write; ++ fdd->opaque = opaque; ++ if (client->fd_count >= client->fd_alloc) ++ { ++ client->fd_alloc = client->fd_alloc*2; ++ if (client->fd_alloc < 16) ++ client->fd_alloc = 16; ++ client->fds = (VitastorFdData**)realloc(client->fds, sizeof(VitastorFdData*) * client->fd_alloc); ++ 
} ++ client->fds[client->fd_count++] = fdd; ++ } ++ universal_aio_set_fd_handler( ++ client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd ++ ); ++} ++ +static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) +{ + VitastorRPC task; @@ -425,10 +578,36 @@ Index: a/block/vitastor.c + client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0); + client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0); + client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0); -+ client->proxy = vitastor_c_create_qemu( -+ vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix, ++ client->ctx = bdrv_get_aio_context(bs); ++#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 ++ client->proxy = vitastor_c_create_qemu_uring( ++ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix, + client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0 + ); ++ if (!client->proxy) ++ { ++ fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno)); ++ client->uring_eventfd = -1; ++#endif ++ client->proxy = vitastor_c_create_qemu( ++ vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix, ++ client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0 ++ ); ++#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 ++ } ++ else ++ { ++ client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy); ++ if (client->uring_eventfd < 0) ++ { ++ fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno)); ++ error_setg(errp, "failed to create io_uring eventfd"); ++ vitastor_close(bs); ++ return -1; ++ } ++ universal_aio_set_fd_handler(client->ctx, 
client->uring_eventfd, vitastor_uring_handler, NULL, client); ++ } ++#endif + image = client->image = g_strdup(qdict_get_try_str(options, "image")); + client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0; + // Get image metadata (size and readonly flag) or just wait until the client is ready @@ -442,7 +621,13 @@ Index: a/block/vitastor.c + } + else + { ++#if QEMU_VERSION_MAJOR >= 8 ++ aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); ++#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3 + bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); ++#else ++ qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); ++#endif + BDRV_POLL_WHILE(bs, !task.complete); + } + client->image = image; @@ -502,6 +687,12 @@ Index: a/block/vitastor.c +{ + VitastorClient *client = bs->opaque; + vitastor_c_destroy(client->proxy); ++ if (client->fds) ++ { ++ free(client->fds); ++ client->fds = NULL; ++ client->fd_alloc = client->fd_count = 0; ++ } + qemu_mutex_destroy(&client->mutex); + if (client->config_path) + g_free(client->config_path); @@ -618,25 +809,44 @@ Index: a/block/vitastor.c + }; +} + -+static void vitastor_co_generic_bh_cb(void *opaque, long retval) ++static void vitastor_co_generic_bh_cb(void *opaque) +{ + VitastorRPC *task = opaque; -+ task->ret = retval; + task->complete = 1; + if (qemu_coroutine_self() != task->co) + { +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8 + aio_co_wake(task->co); +#else ++#if QEMU_VERSION_MAJOR == 2 ++ qemu_bh_delete(task->bh); ++#endif + qemu_coroutine_enter(task->co, NULL); + qemu_aio_release(task); +#endif + } +} + ++static void vitastor_co_generic_cb(void *opaque, long retval) ++{ ++ VitastorRPC *task = opaque; ++ task->ret = retval; ++#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 ++ 
replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 ++ aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++#elif QEMU_VERSION_MAJOR >= 2 ++ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++ qemu_bh_schedule(task->bh); ++#else ++ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque); ++ qemu_bh_schedule(task->bh); ++#endif ++} ++ +static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version) +{ -+ vitastor_co_generic_bh_cb(opaque, retval); ++ vitastor_co_generic_cb(opaque, retval); +} + +static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, @@ -655,6 +865,7 @@ Index: a/block/vitastor.c + uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode; + qemu_mutex_lock(&client->mutex); + vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task); ++ vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) @@ -687,7 +898,8 @@ Index: a/block/vitastor.c + + uint64_t inode = client->watch ? 
vitastor_c_inode_get_num(client->watch) : client->inode; + qemu_mutex_lock(&client->mutex); -+ vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task); ++ vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task); ++ vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) @@ -705,7 +917,6 @@ Index: a/block/vitastor.c + VitastorRPC *task = opaque; + VitastorClient *client = task->bs->opaque; + task->ret = retval; -+ task->complete = 1; + if (retval >= 0) + { + task->bitmap = bitmap; @@ -717,15 +928,17 @@ Index: a/block/vitastor.c + client->last_bitmap = bitmap; + } + } -+ if (qemu_coroutine_self() != task->co) -+ { -+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8 -+ aio_co_wake(task->co); ++#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 ++ replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 ++ aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++#elif QEMU_VERSION_MAJOR >= 2 ++ task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); ++ qemu_bh_schedule(task->bh); +#else -+ qemu_coroutine_enter(task->co, NULL); -+ qemu_aio_release(task); ++ task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque); ++ qemu_bh_schedule(task->bh); +#endif -+ } +} + +static int coroutine_fn vitastor_co_block_status( @@ -766,6 +979,7 @@ Index: a/block/vitastor.c + task.bitmap = client->last_bitmap = NULL; + qemu_mutex_lock(&client->mutex); + vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task); ++ vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + while 
(!task.complete) + { @@ -851,7 +1065,8 @@ Index: a/block/vitastor.c + vitastor_co_init_task(bs, &task); + + qemu_mutex_lock(&client->mutex); -+ vitastor_c_sync(client->proxy, vitastor_co_generic_bh_cb, &task); ++ vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task); ++ vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) @@ -906,8 +1121,13 @@ Index: a/block/vitastor.c + .bdrv_parse_filename = vitastor_parse_filename, + + .bdrv_has_zero_init = bdrv_has_zero_init_1, ++#if QEMU_VERSION_MAJOR >= 8 ++ .bdrv_co_get_info = vitastor_get_info, ++ .bdrv_co_getlength = vitastor_getlength, ++#else + .bdrv_get_info = vitastor_get_info, + .bdrv_getlength = vitastor_getlength, ++#endif +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2 + .bdrv_probe_blocksizes = vitastor_probe_blocksizes, +#endif