Make fsync flags separate for data, metadata and journal

Branch: blocking-uring-test
Author: Vitaliy Filippov, 2020-01-17 13:40:47 +03:00
Parent d5386aa958, commit d0ab2a20b2
5 changed files with 26 additions and 13 deletions
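Note: before this commit a single disable_fsync flag covered the data, metadata and journal devices together. Since the three roles can live on devices with different write-cache behaviour, the commit splits it into disable_data_fsync, disable_meta_fsync and disable_journal_fsync. A minimal usage sketch, assuming blockstore_config_t is the string-to-string map used elsewhere in the codebase:

    #include <map>
    #include <string>

    typedef std::map<std::string, std::string> blockstore_config_t;

    int main()
    {
        blockstore_config_t config;
        // Journal on a power-loss-protected device: its fsync can be skipped.
        config["disable_journal_fsync"] = "true";
        // Data and metadata on a drive with a volatile cache: keep fsync on.
        config["disable_data_fsync"] = "false";
        config["disable_meta_fsync"] = "false";
        return 0;
    }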

@@ -635,14 +635,16 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
         goto resume_1;
     else if (wait_state == wait_base+2)
         goto resume_2;
-    if (!bs->disable_fsync)
+    if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
     {
         cur_sync = flusher->syncs.end();
         while (cur_sync != flusher->syncs.begin())
         {
             cur_sync--;
             if (cur_sync->fsync_meta == fsync_meta && cur_sync->state == 0)
+            {
                 goto sync_found;
+            }
         }
         cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){
             .fsync_meta = fsync_meta,
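Note: fsync_batch() serves both metadata and journal fsync batches, so the new condition picks the flag by its fsync_meta argument. A standalone sketch of the same selection logic (the helper itself is hypothetical, names mirror the fields in the hunk):

    // Returns true if an fsync must actually be issued for this batch.
    static bool fsync_needed(bool fsync_meta, bool disable_meta_fsync, bool disable_journal_fsync)
    {
        return !(fsync_meta ? disable_meta_fsync : disable_journal_fsync);
    }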

@@ -194,9 +194,8 @@ class blockstore_impl_t
     // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
     uint64_t bitmap_granularity = 4096;
     bool readonly = false;
-    // FIXME: separate flags for data, metadata and journal
     // It is safe to disable fsync() if drive write cache is writethrough
-    bool disable_fsync = false;
+    bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
     bool inmemory_meta = false;
     int flusher_count;
     /******* END OF OPTIONS *******/
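Note: the comment retained above is the rationale for all three flags: if a device's write cache is writethrough, completed writes are already durable and fsync() is redundant. A sketch of how that could be checked per device on Linux via sysfs (this helper is illustrative, not part of the codebase):

    #include <fstream>
    #include <string>

    // Reads /sys/block/<dev>/queue/write_cache, which the kernel reports
    // as "write back" or "write through".
    static bool is_writethrough(const std::string & dev) // e.g. "sda"
    {
        std::ifstream f("/sys/block/" + dev + "/queue/write_cache");
        std::string mode;
        std::getline(f, mode);
        return mode == "write through";
    }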

@@ -251,7 +251,7 @@ resume_1:
        data->callback = simple_callback;
        my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
        wait_count++;
-       if (!bs->disable_fsync)
+       if (!bs->disable_journal_fsync)
        {
            GET_SQE();
            my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
@@ -331,7 +331,7 @@ resume_1:
        data->callback = simple_callback;
        wait_count++;
        my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
-       if (!bs->disable_fsync)
+       if (!bs->disable_journal_fsync)
        {
            GET_SQE();
            data->iov = { 0 };
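Note: both hunks follow the same pattern: queue a writev to the journal fd, then chain an IORING_FSYNC_DATASYNC only when the journal flag allows it. A self-contained sketch of the pattern with plain liburing calls (GET_SQE and my_uring_prep_* are the codebase's own wrappers; NULL checks omitted for brevity):

    #include <liburing.h>

    static void queue_journal_write(struct io_uring *ring, int journal_fd,
        struct iovec *iov, off_t offset, bool disable_journal_fsync)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        io_uring_prep_writev(sqe, journal_fd, iov, 1, offset);
        if (!disable_journal_fsync)
        {
            // Durability barrier for the journal device only.
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_fsync(sqe, journal_fd, IORING_FSYNC_DATASYNC);
        }
    }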

@@ -22,9 +22,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
     {
         readonly = true;
     }
-    if (config["disable_fsync"] == "true" || config["disable_fsync"] == "1" || config["disable_fsync"] == "yes")
+    if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
     {
-        disable_fsync = true;
+        disable_data_fsync = true;
+    }
+    if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
+    {
+        disable_meta_fsync = true;
+    }
+    if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
+    {
+        disable_journal_fsync = true;
     }
     metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
     cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
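Note: the three new conditions are copy-pasted with the same truthy spellings ("true", "1", "yes"). A hypothetical refactoring sketch, not part of the commit, that would keep them in one place:

    #include <map>
    #include <string>

    typedef std::map<std::string, std::string> blockstore_config_t;

    static bool config_is_true(blockstore_config_t & config, const std::string & key)
    {
        const std::string & v = config[key];
        return v == "true" || v == "1" || v == "yes";
    }

    // Usage:
    //   disable_data_fsync = config_is_true(config, "disable_data_fsync");
    //   disable_meta_fsync = config_is_true(config, "disable_meta_fsync");
    //   disable_journal_fsync = config_is_true(config, "disable_journal_fsync");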
@@ -265,6 +273,7 @@ void blockstore_impl_t::open_meta()
     else
     {
         meta_fd = data_fd;
+        disable_meta_fsync = disable_data_fsync;
         meta_size = 0;
         if (meta_offset >= data_size)
         {
@@ -287,6 +296,7 @@ void blockstore_impl_t::open_journal()
     else
     {
         journal.fd = meta_fd;
+        disable_journal_fsync = disable_meta_fsync;
         journal.device_size = 0;
         if (journal.offset >= data_size)
         {
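Note: these two one-line additions keep the flags consistent when devices are shared: if metadata reuses the data fd, one fsync on that fd covers both roles, so the meta flag must mirror the data flag, and the same chain applies to the journal reusing the meta fd. A standalone sketch of the cascade (the struct and helper are illustrative):

    struct fsync_flags
    {
        bool disable_data_fsync = false;
        bool disable_meta_fsync = false;
        bool disable_journal_fsync = false;
    };

    // meta_on_data_fd / journal_on_meta_fd correspond to the "else"
    // branches in open_meta() and open_journal() above.
    static void inherit_shared_fd_flags(fsync_flags & f, bool meta_on_data_fd, bool journal_on_meta_fd)
    {
        if (meta_on_data_fd)
            f.disable_meta_fsync = f.disable_data_fsync;
        if (journal_on_meta_fd)
            f.disable_journal_fsync = f.disable_meta_fsync;
    }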

@@ -39,7 +39,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
     if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
     {
         // No big writes, just fsync the journal
-        int n_sqes = disable_fsync ? 0 : 1;
+        int n_sqes = disable_journal_fsync ? 0 : 1;
         if (journal.sector_info[journal.cur_sector].dirty)
         {
             n_sqes++;
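Note: in the small-write path the SQE budget is one optional write of the trailing dirty journal sector plus one optional journal fsync, so with disable_journal_fsync set and a clean sector it can be zero. The arithmetic as a standalone sketch:

    static int small_sync_sqe_count(bool cur_sector_dirty, bool disable_journal_fsync)
    {
        int n_sqes = disable_journal_fsync ? 0 : 1; // the journal fsync itself
        if (cur_sector_dirty)
            n_sqes++; // plus a write of the partially filled journal sector
        return n_sqes;
    }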
@@ -61,7 +61,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        {
            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
        }
-       if (!disable_fsync)
+       if (!disable_journal_fsync)
        {
            ring_data_t *data = ((ring_data_t*)sqes[s]->user_data);
            my_uring_prep_fsync(sqes[s++], journal.fd, IORING_FSYNC_DATASYNC);
@@ -79,7 +79,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
     else if (PRIV(op)->sync_state == SYNC_HAS_BIG)
     {
         // 1st step: fsync data
-        if (!disable_fsync)
+        if (!disable_data_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
             my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
@@ -104,8 +104,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            return 0;
        }
        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
-       struct io_uring_sqe *sqe[space_check.sectors_required + (disable_fsync ? 0 : 1)];
-       for (int i = 0; i < space_check.sectors_required + (disable_fsync ? 0 : 1); i++)
+       struct io_uring_sqe *sqe[space_check.sectors_required + (disable_journal_fsync ? 0 : 1)];
+       for (int i = 0; i < space_check.sectors_required + (disable_journal_fsync ? 0 : 1); i++)
        {
            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
        }
@@ -148,7 +148,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        }
        PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
        // ... And a journal fsync
-       if (!disable_fsync)
+       if (!disable_journal_fsync)
        {
            my_uring_prep_fsync(sqe[s], journal.fd, IORING_FSYNC_DATASYNC);
            struct ring_data_t *data = ((ring_data_t*)sqe[s]->user_data);
@@ -157,7 +157,9 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            PRIV(op)->pending_ops = 1 + s;
        }
        else
+       {
            PRIV(op)->pending_ops = s;
+       }
        PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
        ringloop->submit();
    }
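Note: taken together, the continue_sync() changes preserve the ordering guarantee while letting each device opt out of fsync independently: big data writes are flushed via data_fd before the journal entries referencing them are written and flushed via journal.fd. The completion count in the last hunk follows directly, as in this sketch:

    // pending_ops for the final journal flush step: 's' journal sector
    // writes plus one fsync, unless disable_journal_fsync suppresses it.
    static int journal_flush_pending_ops(int s, bool disable_journal_fsync)
    {
        return s + (disable_journal_fsync ? 0 : 1);
    }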