From 4473eb5512cf6b9434371814dd5c16d85177f9b2 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 14 Jun 2024 02:15:00 +0300 Subject: [PATCH] Fix slow & failing CAS layer merge --- src/client/cluster_client.cpp | 17 +++++++++++------ src/client/cluster_client.h | 2 -- src/cmd/cli_merge.cpp | 13 ++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/client/cluster_client.cpp b/src/client/cluster_client.cpp index 3a92031b..f04dcc68 100644 --- a/src/client/cluster_client.cpp +++ b/src/client/cluster_client.cpp @@ -77,11 +77,6 @@ cluster_client_t::~cluster_client_t() cluster_op_t::~cluster_op_t() { - if (buf) - { - free(buf); - buf = NULL; - } if (bitmap_buf) { free(bitmap_buf); @@ -570,6 +565,14 @@ void cluster_client_t::execute_internal(cluster_op_t *op) { op->cur_inode = op->inode; op->retval = 0; + op->state = 0; + op->retry_after = 0; + op->inflight_count = 0; + op->done_count = 0; + op->part_bitmaps = NULL; + op->bitmap_buf_size = 0; + op->prev_wait = 0; + assert(!op->prev && !op->next); // check alignment, readonly flag and so on if (!check_rw(op)) { @@ -1210,7 +1213,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part) // So do all these things after modifying operation state, otherwise we may hit reenterability bugs // FIXME postpone such things to set_immediate here to avoid bugs // Set op->retry_after to retry operation after a short pause (not immediately) - if (!op->retry_after) + if (!op->retry_after && (op->retval == -EPIPE || + op->retval == -EIO && client_eio_retry_interval || + op->retval == -ENOSPC && client_retry_enospc)) { op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval; } diff --git a/src/client/cluster_client.h b/src/client/cluster_client.h index c77c328a..a424404b 100644 --- a/src/client/cluster_client.h +++ b/src/client/cluster_client.h @@ -56,8 +56,6 @@ struct cluster_op_t protected: int state = 0; uint64_t cur_inode; // for snapshot reads - void *buf = NULL; - cluster_op_t *orig_op = NULL; bool needs_reslice = false; int retry_after = 0; int inflight_count = 0, done_count = 0; diff --git a/src/cmd/cli_merge.cpp b/src/cmd/cli_merge.cpp index fd980985..65677e92 100644 --- a/src/cmd/cli_merge.cpp +++ b/src/cmd/cli_merge.cpp @@ -479,10 +479,14 @@ struct snap_merger_t { if (op->retval != op->len) { - rwo->error_code = -op->retval; + rwo->error_code = op->retval; rwo->error_offset = op->offset; rwo->error_read = true; } + else + { + rwo->error_code = 0; + } continue_rwo.push_back(rwo); parent->ringloop->wakeup(); }; @@ -553,12 +557,15 @@ struct snap_merger_t if (use_cas && subop->retval == -EINTR) { // CAS failure - reread and repeat optimistically + assert(rwo->todo == 1); // initial refcount from read_and_write + rwo->error_code = -EINTR; rwo->start = rwo->end = 0; + rwo->op.version = 0; rwo_read(rwo); delete subop; return; } - rwo->error_code = -subop->retval; + rwo->error_code = subop->retval; rwo->error_offset = subop->offset; rwo->error_read = false; } @@ -633,7 +640,7 @@ struct snap_merger_t { char buf[1024]; snprintf(buf, 1024, "Error %s target at offset %jx: %s", - rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code)); + rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(-rwo->error_code)); rwo_error = std::string(buf); } delete rwo;