|
|
|
@ -4,8 +4,10 @@
|
|
|
|
|
#define SYNC_HAS_BIG 2
|
|
|
|
|
#define SYNC_DATA_SYNC_SENT 3
|
|
|
|
|
#define SYNC_DATA_SYNC_DONE 4
|
|
|
|
|
#define SYNC_JOURNAL_SYNC_SENT 5
|
|
|
|
|
#define SYNC_DONE 6
|
|
|
|
|
#define SYNC_JOURNAL_WRITE_SENT 5
|
|
|
|
|
#define SYNC_JOURNAL_WRITE_DONE 6
|
|
|
|
|
#define SYNC_JOURNAL_SYNC_SENT 7
|
|
|
|
|
#define SYNC_DONE 8
|
|
|
|
|
|
|
|
|
|
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
|
|
|
|
|
{
|
|
|
|
@ -14,23 +16,25 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
|
|
|
|
|
stop_sync_submitted = false;
|
|
|
|
|
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
|
|
|
|
|
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
|
|
|
|
|
unsynced_big_writes.clear();
|
|
|
|
|
unsynced_small_writes.clear();
|
|
|
|
|
if (PRIV(op)->sync_big_writes.size() > 0)
|
|
|
|
|
PRIV(op)->sync_state = SYNC_HAS_BIG;
|
|
|
|
|
else if (PRIV(op)->sync_small_writes.size() > 0)
|
|
|
|
|
PRIV(op)->sync_state = SYNC_HAS_SMALL;
|
|
|
|
|
else
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DONE;
|
|
|
|
|
unsynced_big_writes.clear();
|
|
|
|
|
unsynced_small_writes.clear();
|
|
|
|
|
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
|
|
|
|
|
PRIV(op)->prev_sync_count = in_progress_syncs.size();
|
|
|
|
|
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
|
|
|
|
|
}
|
|
|
|
|
int r = continue_sync(op);
|
|
|
|
|
if (r)
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->prev_sync_count = in_progress_syncs.size();
|
|
|
|
|
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
|
|
|
|
|
ack_sync(op);
|
|
|
|
|
}
|
|
|
|
|
return r;
|
|
|
|
|
// Always dequeue because we always add syncs to in_progress_syncs
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
@ -39,51 +43,42 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
|
if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
|
|
|
|
|
{
|
|
|
|
|
// No big writes, just fsync the journal
|
|
|
|
|
int n_sqes = disable_journal_fsync ? 0 : 1;
|
|
|
|
|
if (journal.sector_info[journal.cur_sector].dirty)
|
|
|
|
|
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
|
|
|
|
|
{
|
|
|
|
|
n_sqes++;
|
|
|
|
|
}
|
|
|
|
|
if (n_sqes > 0)
|
|
|
|
|
{
|
|
|
|
|
io_uring_sqe* sqes[n_sqes];
|
|
|
|
|
for (int i = 0; i < n_sqes; i++)
|
|
|
|
|
{
|
|
|
|
|
BS_SUBMIT_GET_SQE_DECL(sqes[i]);
|
|
|
|
|
}
|
|
|
|
|
int s = 0;
|
|
|
|
|
if (journal.sector_info[journal.cur_sector].dirty)
|
|
|
|
|
{
|
|
|
|
|
prepare_journal_sector_write(journal, journal.cur_sector, sqes[s++], cb);
|
|
|
|
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
|
|
|
|
// Wait for small inflight writes to complete
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
if (!disable_journal_fsync)
|
|
|
|
|
{
|
|
|
|
|
// FIXME: Wait for completion of writes before issuing an fsync
|
|
|
|
|
// Fsync and write requests posted at the same time can be reordered
|
|
|
|
|
ring_data_t *data = ((ring_data_t*)sqes[s]->user_data);
|
|
|
|
|
my_uring_prep_fsync(sqes[s++], journal.fd, IORING_FSYNC_DATASYNC);
|
|
|
|
|
data->iov = { 0 };
|
|
|
|
|
data->callback = cb;
|
|
|
|
|
}
|
|
|
|
|
PRIV(op)->pending_ops = s;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
|
|
|
|
|
}
|
|
|
|
|
if (journal.sector_info[journal.cur_sector].dirty)
|
|
|
|
|
{
|
|
|
|
|
// Write out the last journal sector if it happens to be dirty
|
|
|
|
|
BS_SUBMIT_GET_ONLY_SQE(sqe);
|
|
|
|
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
|
|
|
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
|
|
|
|
PRIV(op)->pending_ops = 1;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DONE;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else if (PRIV(op)->sync_state == SYNC_HAS_BIG)
|
|
|
|
|
if (PRIV(op)->sync_state == SYNC_HAS_BIG)
|
|
|
|
|
{
|
|
|
|
|
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
|
|
|
|
|
{
|
|
|
|
|
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
|
|
|
|
|
{
|
|
|
|
|
// Wait for big inflight writes to complete
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// 1st step: fsync data
|
|
|
|
|
if (!disable_data_fsync)
|
|
|
|
|
{
|
|
|
|
|
// FIXME Wait for completion of writes before issuing an fsync
|
|
|
|
|
BS_SUBMIT_GET_SQE(sqe, data);
|
|
|
|
|
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
|
|
|
|
|
data->iov = { 0 };
|
|
|
|
@ -91,6 +86,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
|
|
|
|
PRIV(op)->pending_ops = 1;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
@ -99,6 +95,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
|
}
|
|
|
|
|
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
|
|
|
|
|
{
|
|
|
|
|
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
|
|
|
|
|
{
|
|
|
|
|
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
|
|
|
|
|
{
|
|
|
|
|
// Wait for small inflight writes to complete
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// 2nd step: Data device is synced, prepare & write journal entries
|
|
|
|
|
// Check space in the journal and journal memory buffers
|
|
|
|
|
blockstore_journal_check_t space_check(this);
|
|
|
|
@ -107,8 +111,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
// Get SQEs. Don't bother about merging, submit each journal sector as a separate request
|
|
|
|
|
struct io_uring_sqe *sqe[space_check.sectors_required + (disable_journal_fsync ? 0 : 1)];
|
|
|
|
|
for (int i = 0; i < space_check.sectors_required + (disable_journal_fsync ? 0 : 1); i++)
|
|
|
|
|
struct io_uring_sqe *sqe[space_check.sectors_required];
|
|
|
|
|
for (int i = 0; i < space_check.sectors_required; i++)
|
|
|
|
|
{
|
|
|
|
|
BS_SUBMIT_GET_SQE_DECL(sqe[i]);
|
|
|
|
|
}
|
|
|
|
@ -150,22 +154,26 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
|
|
|
|
// ... And a journal fsync
|
|
|
|
|
PRIV(op)->pending_ops = s;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
|
|
|
|
|
{
|
|
|
|
|
if (!disable_journal_fsync)
|
|
|
|
|
{
|
|
|
|
|
// FIXME Wait for completion of writes before issuing an fsync
|
|
|
|
|
my_uring_prep_fsync(sqe[s], journal.fd, IORING_FSYNC_DATASYNC);
|
|
|
|
|
struct ring_data_t *data = ((ring_data_t*)sqe[s]->user_data);
|
|
|
|
|
BS_SUBMIT_GET_SQE(sqe, data);
|
|
|
|
|
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
|
|
|
|
data->iov = { 0 };
|
|
|
|
|
data->callback = cb;
|
|
|
|
|
PRIV(op)->pending_ops = 1 + s;
|
|
|
|
|
PRIV(op)->pending_ops = 1;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->pending_ops = s;
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DONE;
|
|
|
|
|
}
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
|
|
|
|
|
ringloop->submit();
|
|
|
|
|
}
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
@ -190,6 +198,10 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
|
|
|
|
|
}
|
|
|
|
|
else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
|
|
|
|
|
}
|
|
|
|
|
else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
|
|
|
|
|
{
|
|
|
|
|
PRIV(op)->sync_state = SYNC_DONE;
|
|
|
|
|