Treat copied buffers as written only after completing the write in client
Test / buildenv (push) Successful in 10s Details
Test / build (push) Successful in 12s Details
Test / npm_lint (push) Successful in 10s Details
Test / test_cas (push) Successful in 10s Details
Test / make_test (push) Successful in 40s Details
Test / test_change_pg_count_ec (push) Successful in 34s Details
Test / test_change_pg_count (push) Successful in 38s Details
Test / test_change_pg_size (push) Successful in 10s Details
Test / test_create_nomaxid (push) Successful in 8s Details
Test / test_etcd_fail (push) Successful in 57s Details
Test / test_add_osd (push) Successful in 2m43s Details
Test / test_interrupted_rebalance_imm (push) Successful in 2m1s Details
Test / test_interrupted_rebalance (push) Successful in 2m34s Details
Test / test_failure_domain (push) Successful in 38s Details
Test / test_snapshot (push) Successful in 19s Details
Test / test_interrupted_rebalance_ec (push) Successful in 2m6s Details
Test / test_minsize_1 (push) Successful in 15s Details
Test / test_snapshot_ec (push) Successful in 27s Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m24s Details
Test / test_move_reappear (push) Successful in 21s Details
Test / test_rm (push) Successful in 14s Details
Test / test_snapshot_down (push) Successful in 31s Details
Test / test_snapshot_down_ec (push) Successful in 32s Details
Test / test_splitbrain (push) Successful in 25s Details
Test / test_snapshot_chain (push) Successful in 2m43s Details
Test / test_snapshot_chain_ec (push) Successful in 3m18s Details
Test / test_rebalance_verify_imm (push) Successful in 3m22s Details
Test / test_root_node (push) Successful in 57s Details
Test / test_rebalance_verify (push) Successful in 5m10s Details
Test / test_switch_primary (push) Successful in 37s Details
Test / test_write (push) Successful in 55s Details
Test / test_write_xor (push) Successful in 53s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m40s Details
Test / test_write_no_same (push) Successful in 15s Details
Test / test_rebalance_verify_ec (push) Successful in 6m34s Details
Test / test_heal_pg_size_2 (push) Successful in 4m32s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m24s Details
Test / test_heal_ec (push) Successful in 4m54s Details
Test / test_heal_csum_32k_dj (push) Successful in 5m58s Details
Test / test_heal_csum_32k (push) Successful in 9m43s Details
Test / test_heal_csum_4k_dmj (push) Successful in 9m41s Details
Test / test_osd_tags (push) Successful in 13s Details
Test / test_heal_csum_4k_dj (push) Successful in 10m0s Details
Test / test_enospc (push) Successful in 44s Details
Test / test_enospc_xor (push) Successful in 55s Details
Test / test_enospc_imm (push) Successful in 44s Details
Test / test_scrub (push) Successful in 31s Details
Test / test_enospc_imm_xor (push) Successful in 57s Details
Test / test_scrub_zero_osd_2 (push) Successful in 23s Details
Test / test_scrub_xor (push) Successful in 27s Details
Test / test_scrub_pg_size_3 (push) Successful in 46s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s Details
Test / test_scrub_ec (push) Successful in 25s Details
Test / test_nfs (push) Successful in 14s Details
Test / test_heal_csum_4k (push) Successful in 9m27s Details

SYNC operation fsyncs only completed operations, so treating writes as "eligible
for fsync" before actually completing them is incorrect

It affected SCHEME=ec test_heal.sh (with immediate_commit=none) test - it was
flapping with lost writes - some non-fsynced writes were legitimately lost by
the OSD, but weren't repeated by the client
msgr-iothreads
Vitaliy Filippov 2024-06-19 23:47:39 +03:00
parent e400a851f4
commit ea0d72289c
5 changed files with 34 additions and 22 deletions

View File

@ -617,7 +617,9 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
{
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
{
wb->copy_write(op, CACHE_WRITTEN);
uint64_t flush_id = ++wb->last_flush_id;
wb->copy_write(op, CACHE_REPEATING, flush_id);
op->flush_id = flush_id;
}
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
{
@ -833,6 +835,10 @@ resume_2:
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
op->retval = op->len / pool_cfg.bitmap_granularity;
}
if (op->flush_id)
{
wb->mark_flush_written(op->inode, op->offset, op->len, op->flush_id);
}
erase_op(op);
return 1;
}

View File

@ -64,6 +64,7 @@ protected:
unsigned bitmap_buf_size = 0;
cluster_op_t *prev = NULL, *next = NULL;
int prev_wait = 0;
uint64_t flush_id = 0;
friend class cluster_client_t;
friend class writeback_cache_t;
};

View File

@ -46,11 +46,12 @@ public:
bool is_left_merged(dirty_buf_it_t dirty_it);
bool is_right_merged(dirty_buf_it_t dirty_it);
bool is_merged(const dirty_buf_it_t & dirty_it);
void copy_write(cluster_op_t *op, int state);
void copy_write(cluster_op_t *op, int state, uint64_t new_flush_id = 0);
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num);
void start_writebacks(cluster_client_t *cli, int count);
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
void fsync_start();
void fsync_error();
void fsync_ok();

View File

@ -71,7 +71,7 @@ bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
return is_left_merged(dirty_it) || is_right_merged(dirty_it);
}
void writeback_cache_t::copy_write(cluster_op_t *op, int state)
void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flush_id)
{
// Save operation for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
@ -180,6 +180,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state)
.buf = buf,
.len = op->len,
.state = state,
.flush_id = new_flush_id,
.refcnt = refcnt,
});
if (state == CACHE_DIRTY)
@ -268,7 +269,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
writebacks_active++;
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// Buffer flushes are always retried, regardless of the error,
// so they should never result in an error here
assert(op->retval == op->len);
for (auto fl_it = flushed_buffers.find(flush_id);
@ -280,16 +281,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
flushed_buffers.erase(fl_it++);
}
for (auto dirty_it = find_dirty(op->inode, op->offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
dirty_it->first.stripe < op->offset+op->len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
mark_flush_written(op->inode, op->offset, op->len, flush_id);
delete op;
writebacks_active--;
// We can't call execute_internal because it affects an invalid copy of the list here
@ -307,6 +299,20 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
}
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
{
for (auto dirty_it = find_dirty(inode, offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode &&
dirty_it->first.stripe < offset+len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
}
void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
{
if (!writeback_queue.size())

View File

@ -43,8 +43,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
},
});
cli->st_cli.on_load_pgs_hook(true);
std::map<std::string, etcd_kv_t> changes;
cli->st_cli.on_change_hook(changes);
cli->st_cli.on_change_pool_config_hook();
}
int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL, bool instant = false)
@ -281,7 +280,8 @@ void test1()
uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58);
if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
{
printf("Write replay: mismatch at %ju\n", offset-op->req.rw.offset);
printf("Write replay: mismatch at %ju (expected %02x, have %02x)\n", offset-op->req.rw.offset,
c, ((uint8_t*)op->iov.buf[buf_idx].iov_base)[i]);
goto fail;
}
}
@ -290,9 +290,9 @@ void test1()
assert(offset == op->req.rw.offset+op->req.rw.len);
replay_ops.push_back(op);
}
if (replay_start != 0 || replay_end != 0x14000)
if (replay_start != 0 || replay_end != 0x10000)
{
printf("Write replay: range mismatch: %jx-%jx\n", replay_start, replay_end);
printf("Write replay: range mismatch: 0x%jx-0x%jx (expected 0-0x10000)\n", replay_start, replay_end);
assert(0);
}
for (auto op: replay_ops)
@ -320,8 +320,6 @@ void test1()
check_disconnected(cli, 1);
pretend_connected(cli, 1);
check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_op_count(cli, 1, 1);
can_complete(r1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_completed(r1);
@ -341,7 +339,7 @@ void test1()
pretend_connected(cli, 1);
cli->continue_ops(true);
check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x2000), 0);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_op_count(cli, 1, 1);
can_complete(r2);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);