2019-12-15 14:49:10 +03:00
|
|
|
#include "blockstore_impl.h"
|
2019-11-10 01:40:48 +03:00
|
|
|
|
2019-11-10 13:49:26 +03:00
|
|
|
// States of a sync operation (stored in PRIV(op)->sync_state).
// A value of 0 means the operation has not been started yet (see dequeue_sync()).

// Only small writes were collected for this sync
#define SYNC_HAS_SMALL 1
// At least one big write was collected for this sync
#define SYNC_HAS_BIG 2
// fsync of the data device has been submitted
#define SYNC_DATA_SYNC_SENT 3
// fsync of the data device has completed (or is disabled)
#define SYNC_DATA_SYNC_DONE 4
// Journal sector write(s) have been submitted
#define SYNC_JOURNAL_WRITE_SENT 5
// Journal sector write(s) have completed (or there was nothing to write)
#define SYNC_JOURNAL_WRITE_DONE 6
// fsync of the journal has been submitted
#define SYNC_JOURNAL_SYNC_SENT 7
// Everything is flushed; the sync may be acknowledged
#define SYNC_DONE 8
|
2019-11-10 01:40:48 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Starts processing of a sync operation.
//
// On the first call for a given op (sync_state == 0) the operation takes over
// the lists of not-yet-synced big and small writes, picks its initial state
// and is appended to in_progress_syncs. After that the state machine is
// advanced once through continue_sync().
//
// Always returns 1: the operation is tracked in in_progress_syncs from now on,
// so it is always dequeued.
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
{
    auto priv = PRIV(op);
    if (priv->sync_state == 0)
    {
        stop_sync_submitted = false;
        // Move all unsynced writes into this operation
        priv->sync_big_writes.swap(unsynced_big_writes);
        priv->sync_small_writes.swap(unsynced_small_writes);
        priv->sync_big_checked = 0;
        priv->sync_small_checked = 0;
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        // Big writes take priority; with nothing to sync the operation is already done
        priv->sync_state = priv->sync_big_writes.size() > 0
            ? SYNC_HAS_BIG
            : (priv->sync_small_writes.size() > 0 ? SYNC_HAS_SMALL : SYNC_DONE);
        // The unsynced lists were cleared above, so the sync must be registered
        // in in_progress_syncs even when it has nothing to do
        priv->prev_sync_count = in_progress_syncs.size();
        priv->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
    }
    continue_sync(op);
    // Syncs are always added to in_progress_syncs, hence always dequeued
    return 1;
}
|
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Advances a sync operation through its state machine, submitting at most one
// batch of requests per call:
//
//   SYNC_HAS_SMALL          -> write out the dirty current journal sector (if any)
//   SYNC_HAS_BIG            -> fsync the data device (unless disable_data_fsync)
//   SYNC_DATA_SYNC_DONE     -> write journal entries for all big writes
//   SYNC_JOURNAL_WRITE_DONE -> fsync the journal (unless disable_journal_fsync)
//   SYNC_DONE               -> try to acknowledge the sync via ack_sync()
//
// Completions of submitted requests are delivered to handle_sync_event().
//
// Returns 0 when the operation has to wait (a covered write is still in flight
// or check_available() reports that the journal has no room), 1 otherwise.
// NOTE(review): the BS_SUBMIT_GET_* macros are defined elsewhere and presumably
// return early when no SQE is available - confirm against their definition.
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
    auto priv = PRIV(op);
    auto on_complete = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
    if (priv->sync_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal.
        // sync_small_checked remembers how far the scan got, so it resumes on the next call
        while (priv->sync_small_checked < priv->sync_small_writes.size())
        {
            if (IS_IN_FLIGHT(dirty_db[priv->sync_small_writes[priv->sync_small_checked]].state))
            {
                // Wait for small inflight writes to complete
                return 0;
            }
            priv->sync_small_checked++;
        }
        if (journal.sector_info[journal.cur_sector].dirty)
        {
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sector_sqe);
            prepare_journal_sector_write(journal, journal.cur_sector, sector_sqe, on_complete);
            priv->max_used_journal_sector = 1 + journal.cur_sector;
            priv->min_used_journal_sector = priv->max_used_journal_sector;
            priv->pending_ops = 1;
            priv->sync_state = SYNC_JOURNAL_WRITE_SENT;
            return 1;
        }
        // Nothing to write, go straight to the journal fsync
        priv->sync_state = SYNC_JOURNAL_WRITE_DONE;
    }
    if (priv->sync_state == SYNC_HAS_BIG)
    {
        while (priv->sync_big_checked < priv->sync_big_writes.size())
        {
            if (IS_IN_FLIGHT(dirty_db[priv->sync_big_writes[priv->sync_big_checked]].state))
            {
                // Wait for big inflight writes to complete
                return 0;
            }
            priv->sync_big_checked++;
        }
        // 1st step: fsync data
        if (disable_data_fsync)
        {
            priv->sync_state = SYNC_DATA_SYNC_DONE;
        }
        else
        {
            BS_SUBMIT_GET_SQE(data_sqe, data_req);
            my_uring_prep_fsync(data_sqe, data_fd_index, IORING_FSYNC_DATASYNC);
            data_sqe->flags |= IOSQE_FIXED_FILE;
            data_req->iov = { 0 };
            data_req->callback = on_complete;
            // No journal sectors are involved in this step
            priv->max_used_journal_sector = 0;
            priv->min_used_journal_sector = priv->max_used_journal_sector;
            priv->pending_ops = 1;
            priv->sync_state = SYNC_DATA_SYNC_SENT;
            return 1;
        }
    }
    if (priv->sync_state == SYNC_DATA_SYNC_DONE)
    {
        while (priv->sync_small_checked < priv->sync_small_writes.size())
        {
            if (IS_IN_FLIGHT(dirty_db[priv->sync_small_writes[priv->sync_small_checked]].state))
            {
                // Wait for small inflight writes to complete
                return 0;
            }
            priv->sync_small_checked++;
        }
        // 2nd step: Data device is synced, prepare & write journal entries
        // Check space in the journal and journal memory buffers
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, priv->sync_big_writes.size(), sizeof(journal_entry_big_write), 0))
        {
            return 0;
        }
        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
        struct io_uring_sqe *sqes[space_check.sectors_required];
        for (int i = 0; i < space_check.sectors_required; i++)
        {
            BS_SUBMIT_GET_SQE_DECL(sqes[i]);
        }
        // Prepare and submit journal entries
        int sqes_used = 0;
        // Journal sector for which a write was prepared last; -1 = none yet
        int last_sector = -1;
        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
            // The current sector is dirty but cannot hold one more entry: write it out first.
            // Nothing has been submitted yet, so it is also the first sector used by this sync
            priv->min_used_journal_sector = 1 + journal.cur_sector;
            last_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, last_sector, sqes[sqes_used++], on_complete);
        }
        for (auto big_it = priv->sync_big_writes.begin(); big_it != priv->sync_big_writes.end(); big_it++)
        {
            journal_entry_big_write *entry = (journal_entry_big_write*)
                prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
            dirty_db[*big_it].journal_sector = journal.sector_info[journal.cur_sector].offset;
            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
            printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*big_it].journal_sector, big_it->oid.inode, big_it->oid.stripe, big_it->version);
#endif
            entry->oid = big_it->oid;
            entry->version = big_it->version;
            entry->offset = dirty_db[*big_it].offset;
            entry->len = dirty_db[*big_it].len;
            entry->location = dirty_db[*big_it].location;
            entry->crc32 = je_crc32((journal_entry*)entry);
            journal.crc32_last = entry->crc32;
            if (last_sector != journal.cur_sector)
            {
                // The entry landed in a sector that has no write prepared yet
                if (last_sector == -1)
                {
                    priv->min_used_journal_sector = 1 + journal.cur_sector;
                }
                last_sector = journal.cur_sector;
                prepare_journal_sector_write(journal, last_sector, sqes[sqes_used++], on_complete);
            }
        }
        priv->max_used_journal_sector = 1 + journal.cur_sector;
        priv->pending_ops = sqes_used;
        priv->sync_state = SYNC_JOURNAL_WRITE_SENT;
        return 1;
    }
    if (priv->sync_state == SYNC_JOURNAL_WRITE_DONE)
    {
        if (disable_journal_fsync)
        {
            priv->sync_state = SYNC_DONE;
        }
        else
        {
            BS_SUBMIT_GET_SQE(journal_sqe, journal_req);
            my_uring_prep_fsync(journal_sqe, journal_fd_index, IORING_FSYNC_DATASYNC);
            journal_sqe->flags |= IOSQE_FIXED_FILE;
            journal_req->iov = { 0 };
            journal_req->callback = on_complete;
            priv->pending_ops = 1;
            priv->sync_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
        }
    }
    if (priv->sync_state == SYNC_DONE)
    {
        ack_sync(op);
    }
    return 1;
}
|
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Completion callback for every request submitted by continue_sync().
//
// Throws std::runtime_error when the result of the request differs from the
// expected length (data->iov.iov_len). Once the last pending request of the
// operation has completed, releases the journal sectors used by it and moves
// the operation from its "*_SENT" state to the matching "*_DONE" state; after
// the journal fsync the operation becomes SYNC_DONE and ack_sync() is attempted.
// Any other state at that point is reported as a bug via std::runtime_error.
void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op)
{
    live = true;
    if (data->res != data->iov.iov_len)
    {
        std::string message = "write operation failed (";
        message += std::to_string(data->res);
        message += " != ";
        message += std::to_string(data->iov.iov_len);
        message += "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111";
        throw std::runtime_error(message);
    }
    auto priv = PRIV(op);
    priv->pending_ops--;
    if (priv->pending_ops != 0)
    {
        // Other requests of this step are still outstanding
        return;
    }
    // Release used journal sectors
    release_journal_sectors(op);
    // Handle states
    switch (priv->sync_state)
    {
    case SYNC_DATA_SYNC_SENT:
        priv->sync_state = SYNC_DATA_SYNC_DONE;
        break;
    case SYNC_JOURNAL_WRITE_SENT:
        priv->sync_state = SYNC_JOURNAL_WRITE_DONE;
        break;
    case SYNC_JOURNAL_SYNC_SENT:
        priv->sync_state = SYNC_DONE;
        ack_sync(op);
        break;
    default:
        throw std::runtime_error("BUG: unexpected sync op state");
    }
}
|
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Acknowledges a finished sync together with any later syncs that were only
// waiting for it.
//
// A sync may be acknowledged only when it is in SYNC_DONE state and no earlier
// sync is still in progress (prev_sync_count == 0). After acknowledging op,
// every following entry of in_progress_syncs gets its prev_sync_count reduced
// by the number of syncs acknowledged so far, and is acknowledged too if that
// makes it eligible.
//
// Returns 1 if op was acknowledged, 0 if it has to wait.
int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
    if (PRIV(op)->sync_state != SYNC_DONE || PRIV(op)->prev_sync_count != 0)
    {
        return 0;
    }
    // Remove dependency of subsequent syncs.
    // Step past our own list entry first: ack_one_sync() erases it
    auto follower = PRIV(op)->in_progress_ptr;
    ++follower;
    // Acknowledge sync
    ack_one_sync(op);
    int acked = 1;
    while (follower != in_progress_syncs.end())
    {
        // Advance before a possible ack_one_sync() erases this entry
        auto later_sync = *follower;
        ++follower;
        PRIV(later_sync)->prev_sync_count -= acked;
        if (PRIV(later_sync)->prev_sync_count == 0 && PRIV(later_sync)->sync_state == SYNC_DONE)
        {
            acked++;
            // Acknowledge next sync
            ack_one_sync(later_sync);
        }
    }
    return 1;
}
|
2019-11-27 18:04:52 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Finalizes a single sync operation.
//
// For every write covered by the sync: raises the version recorded in
// unstable_writes for the object if this write is newer, and switches the
// dirty_db entry to its synced state (ST_D_META_SYNCED for big writes;
// ST_DEL_SYNCED or ST_J_SYNCED for small writes, depending on whether the
// entry was ST_DEL_WRITTEN). Then removes op from in_progress_syncs and
// completes it with retval 0.
void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
{
    // Handle states
    for (auto & big_write : PRIV(op)->sync_big_writes)
    {
#ifdef BLOCKSTORE_DEBUG
        printf("Ack sync big %lu:%lu v%lu\n", big_write.oid.inode, big_write.oid.stripe, big_write.version);
#endif
        auto & unstable_ver = unstable_writes[big_write.oid];
        if (unstable_ver < big_write.version)
        {
            unstable_ver = big_write.version;
        }
        dirty_db[big_write].state = ST_D_META_SYNCED;
    }
    for (auto & small_write : PRIV(op)->sync_small_writes)
    {
#ifdef BLOCKSTORE_DEBUG
        printf("Ack sync small %lu:%lu v%lu\n", small_write.oid.inode, small_write.oid.stripe, small_write.version);
#endif
        auto & unstable_ver = unstable_writes[small_write.oid];
        if (unstable_ver < small_write.version)
        {
            unstable_ver = small_write.version;
        }
        if (dirty_db[small_write].state == ST_DEL_WRITTEN)
        {
            dirty_db[small_write].state = ST_DEL_SYNCED;
        }
        else
        {
            dirty_db[small_write].state = ST_J_SYNCED;
        }
    }
    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
    op->retval = 0;
    FINISH_OP(op);
}
|