Fix slow & failing CAS layer merge
Test / buildenv (push) Successful in 17s Details
Test / build (push) Successful in 3m34s Details
Test / npm_lint (push) Successful in 10s Details
Test / test_cas (push) Successful in 10s Details
Test / make_test (push) Successful in 39s Details
Test / test_change_pg_size (push) Successful in 11s Details
Test / test_change_pg_count (push) Successful in 38s Details
Test / test_change_pg_count_ec (push) Successful in 37s Details
Test / test_create_nomaxid (push) Successful in 8s Details
Test / test_etcd_fail (push) Successful in 1m2s Details
Test / test_add_osd (push) Successful in 2m44s Details
Test / test_interrupted_rebalance (push) Successful in 2m1s Details
Test / test_interrupted_rebalance_imm (push) Successful in 2m2s Details
Test / test_failure_domain (push) Successful in 44s Details
Test / test_snapshot (push) Successful in 50s Details
Test / test_interrupted_rebalance_ec (push) Successful in 2m1s Details
Test / test_snapshot_ec (push) Successful in 25s Details
Test / test_minsize_1 (push) Successful in 14s Details
Test / test_move_reappear (push) Successful in 22s Details
Test / test_rm (push) Successful in 15s Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m33s Details
Test / test_snapshot_down (push) Successful in 33s Details
Test / test_snapshot_down_ec (push) Successful in 31s Details
Test / test_splitbrain (push) Successful in 28s Details
Test / test_snapshot_chain (push) Successful in 2m14s Details
Test / test_snapshot_chain_ec (push) Successful in 2m56s Details
Test / test_rebalance_verify_imm (push) Successful in 2m44s Details
Test / test_root_node (push) Successful in 12s Details
Test / test_rebalance_verify (push) Successful in 3m24s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_rebalance_verify_ec (push) Successful in 3m3s Details
Test / test_write_xor (push) Successful in 44s Details
Test / test_write_no_same (push) Successful in 15s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 2m30s Details
Test / test_heal_pg_size_2 (push) Successful in 4m37s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m30s Details
Test / test_heal_ec (push) Successful in 4m45s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m8s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m39s Details
Test / test_heal_csum_32k (push) Successful in 6m42s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m30s Details
Test / test_osd_tags (push) Successful in 18s Details
Test / test_enospc (push) Successful in 1m21s Details
Test / test_enospc_imm (push) Successful in 1m13s Details
Test / test_enospc_xor (push) Successful in 2m2s Details
Test / test_scrub (push) Successful in 1m5s Details
Test / test_enospc_imm_xor (push) Successful in 1m42s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s Details
Test / test_heal_csum_4k (push) Successful in 6m18s Details
Test / test_scrub_xor (push) Successful in 31s Details
Test / test_nfs (push) Successful in 24s Details
Test / test_scrub_ec (push) Successful in 34s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s Details
Test / test_scrub_pg_size_3 (push) Successful in 43s Details
Test / test_write (push) Successful in 40s Details

fsync-after-complete
Vitaliy Filippov 2024-06-14 02:15:00 +03:00
parent 6501abc060
commit 4473eb5512
3 changed files with 21 additions and 11 deletions

View File

@@ -77,11 +77,6 @@ cluster_client_t::~cluster_client_t()
cluster_op_t::~cluster_op_t()
{
if (buf)
{
free(buf);
buf = NULL;
}
if (bitmap_buf)
{
free(bitmap_buf);
@@ -570,6 +565,14 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
{
op->cur_inode = op->inode;
op->retval = 0;
op->state = 0;
op->retry_after = 0;
op->inflight_count = 0;
op->done_count = 0;
op->part_bitmaps = NULL;
op->bitmap_buf_size = 0;
op->prev_wait = 0;
assert(!op->prev && !op->next);
// check alignment, readonly flag and so on
if (!check_rw(op))
{
@@ -1210,7 +1213,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
// Set op->retry_after to retry operation after a short pause (not immediately)
if (!op->retry_after)
if (!op->retry_after && (op->retval == -EPIPE ||
op->retval == -EIO && client_eio_retry_interval ||
op->retval == -ENOSPC && client_retry_enospc))
{
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
}

View File

@@ -56,8 +56,6 @@ struct cluster_op_t
protected:
int state = 0;
uint64_t cur_inode; // for snapshot reads
void *buf = NULL;
cluster_op_t *orig_op = NULL;
bool needs_reslice = false;
int retry_after = 0;
int inflight_count = 0, done_count = 0;

View File

@@ -479,10 +479,14 @@ struct snap_merger_t
{
if (op->retval != op->len)
{
rwo->error_code = -op->retval;
rwo->error_code = op->retval;
rwo->error_offset = op->offset;
rwo->error_read = true;
}
else
{
rwo->error_code = 0;
}
continue_rwo.push_back(rwo);
parent->ringloop->wakeup();
};
@@ -553,12 +557,15 @@ struct snap_merger_t
if (use_cas && subop->retval == -EINTR)
{
// CAS failure - reread and repeat optimistically
assert(rwo->todo == 1); // initial refcount from read_and_write
rwo->error_code = -EINTR;
rwo->start = rwo->end = 0;
rwo->op.version = 0;
rwo_read(rwo);
delete subop;
return;
}
rwo->error_code = -subop->retval;
rwo->error_code = subop->retval;
rwo->error_offset = subop->offset;
rwo->error_read = false;
}
@@ -633,7 +640,7 @@ struct snap_merger_t
{
char buf[1024];
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(-rwo->error_code));
rwo_error = std::string(buf);
}
delete rwo;