Fix slow & failing CAS layer merge

Vitaliy Filippov 2024-06-14 02:15:00 +03:00
parent 6501abc060
commit 4473eb5512
3 changed files with 21 additions and 11 deletions

View File

@ -77,11 +77,6 @@ cluster_client_t::~cluster_client_t()
cluster_op_t::~cluster_op_t()
{
if (buf)
{
free(buf);
buf = NULL;
}
if (bitmap_buf)
{
free(bitmap_buf);
@ -570,6 +565,14 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
{
op->cur_inode = op->inode;
op->retval = 0;
op->state = 0;
op->retry_after = 0;
op->inflight_count = 0;
op->done_count = 0;
op->part_bitmaps = NULL;
op->bitmap_buf_size = 0;
op->prev_wait = 0;
assert(!op->prev && !op->next);
// check alignment, readonly flag and so on
if (!check_rw(op))
{
@ -1210,7 +1213,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
// Set op->retry_after to retry operation after a short pause (not immediately)
if (!op->retry_after)
if (!op->retry_after && (op->retval == -EPIPE ||
op->retval == -EIO && client_eio_retry_interval ||
op->retval == -ENOSPC && client_retry_enospc))
{
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
}

View File

@ -56,8 +56,6 @@ struct cluster_op_t
protected:
int state = 0;
uint64_t cur_inode; // for snapshot reads
void *buf = NULL;
cluster_op_t *orig_op = NULL;
bool needs_reslice = false;
int retry_after = 0;
int inflight_count = 0, done_count = 0;

View File

@ -479,10 +479,14 @@ struct snap_merger_t
{
if (op->retval != op->len)
{
rwo->error_code = -op->retval;
rwo->error_code = op->retval;
rwo->error_offset = op->offset;
rwo->error_read = true;
}
else
{
rwo->error_code = 0;
}
continue_rwo.push_back(rwo);
parent->ringloop->wakeup();
};
@ -553,12 +557,15 @@ struct snap_merger_t
if (use_cas && subop->retval == -EINTR)
{
// CAS failure - reread and repeat optimistically
assert(rwo->todo == 1); // initial refcount from read_and_write
rwo->error_code = -EINTR;
rwo->start = rwo->end = 0;
rwo->op.version = 0;
rwo_read(rwo);
delete subop;
return;
}
rwo->error_code = -subop->retval;
rwo->error_code = subop->retval;
rwo->error_offset = subop->offset;
rwo->error_read = false;
}
@ -633,7 +640,7 @@ struct snap_merger_t
{
char buf[1024];
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(-rwo->error_code));
rwo_error = std::string(buf);
}
delete rwo;