forked from vitalif/vitastor
Fix journal trimming
1) Update the journal's used_start in memory only after updating the journal superblock. Doing it in the opposite order is incorrect, because part of the journal will be lost if writers overwrite its old beginning. 2) Sync the journal device after updating the superblock. 3) Do not trim in rollback and init, because trimming there would also require updating the superblock. The only reason to trim in both of those places was to unblock writers, and a guaranteed unblocking method will follow in the next commit :)
parent
99c45bb5ed
commit
5fbe36198a
|
@ -8,10 +8,12 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
|
||||||
this->bs = bs;
|
this->bs = bs;
|
||||||
this->flusher_count = flusher_count;
|
this->flusher_count = flusher_count;
|
||||||
dequeuing = false;
|
dequeuing = false;
|
||||||
|
trimming = false;
|
||||||
active_flushers = 0;
|
active_flushers = 0;
|
||||||
syncing_flushers = 0;
|
syncing_flushers = 0;
|
||||||
|
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
|
||||||
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
|
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
|
||||||
journal_trim_interval = flusher_start_threshold;
|
journal_trim_interval = 512;
|
||||||
journal_trim_counter = 0;
|
journal_trim_counter = 0;
|
||||||
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
|
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
|
||||||
co = new journal_flusher_co[flusher_count];
|
co = new journal_flusher_co[flusher_count];
|
||||||
|
@ -172,6 +174,12 @@ bool journal_flusher_co::loop()
|
||||||
goto resume_17;
|
goto resume_17;
|
||||||
else if (wait_state == 18)
|
else if (wait_state == 18)
|
||||||
goto resume_18;
|
goto resume_18;
|
||||||
|
else if (wait_state == 19)
|
||||||
|
goto resume_19;
|
||||||
|
else if (wait_state == 20)
|
||||||
|
goto resume_20;
|
||||||
|
else if (wait_state == 21)
|
||||||
|
goto resume_21;
|
||||||
resume_0:
|
resume_0:
|
||||||
if (!flusher->flush_queue.size() || !flusher->dequeuing)
|
if (!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||||
{
|
{
|
||||||
|
@ -484,9 +492,18 @@ resume_1:
|
||||||
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
|
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
|
||||||
{
|
{
|
||||||
flusher->journal_trim_counter = 0;
|
flusher->journal_trim_counter = 0;
|
||||||
if (bs->journal.trim())
|
new_trim_pos = bs->journal.get_trim_pos();
|
||||||
|
if (new_trim_pos != bs->journal.used_start)
|
||||||
{
|
{
|
||||||
// Update journal "superblock"
|
resume_19:
|
||||||
|
// Wait for other coroutines trimming the journal, if any
|
||||||
|
if (flusher->trimming)
|
||||||
|
{
|
||||||
|
wait_state = 19;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
flusher->trimming = true;
|
||||||
|
// First update journal "superblock" and only then update <used_start> in memory
|
||||||
await_sqe(12);
|
await_sqe(12);
|
||||||
*((journal_entry_start*)flusher->journal_superblock) = {
|
*((journal_entry_start*)flusher->journal_superblock) = {
|
||||||
.crc32 = 0,
|
.crc32 = 0,
|
||||||
|
@ -494,7 +511,7 @@ resume_1:
|
||||||
.type = JE_START,
|
.type = JE_START,
|
||||||
.size = sizeof(journal_entry_start),
|
.size = sizeof(journal_entry_start),
|
||||||
.reserved = 0,
|
.reserved = 0,
|
||||||
.journal_start = bs->journal.used_start,
|
.journal_start = new_trim_pos,
|
||||||
};
|
};
|
||||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
|
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
|
||||||
|
@ -507,6 +524,24 @@ resume_1:
|
||||||
wait_state = 13;
|
wait_state = 13;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (!bs->disable_journal_fsync)
|
||||||
|
{
|
||||||
|
await_sqe(20);
|
||||||
|
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
|
||||||
|
data->iov = { 0 };
|
||||||
|
data->callback = simple_callback_w;
|
||||||
|
resume_21:
|
||||||
|
if (wait_count > 0)
|
||||||
|
{
|
||||||
|
wait_state = 21;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bs->journal.used_start = new_trim_pos;
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
|
||||||
|
#endif
|
||||||
|
flusher->trimming = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// All done
|
// All done
|
||||||
|
|
|
@ -59,6 +59,8 @@ class journal_flusher_co
|
||||||
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
||||||
void *new_clean_bitmap;
|
void *new_clean_bitmap;
|
||||||
|
|
||||||
|
uint64_t new_trim_pos;
|
||||||
|
|
||||||
// local: scan_dirty()
|
// local: scan_dirty()
|
||||||
uint64_t offset, end_offset, submit_offset, submit_len;
|
uint64_t offset, end_offset, submit_offset, submit_len;
|
||||||
|
|
||||||
|
@ -85,6 +87,7 @@ class journal_flusher_t
|
||||||
friend class journal_flusher_co;
|
friend class journal_flusher_co;
|
||||||
|
|
||||||
int journal_trim_counter, journal_trim_interval;
|
int journal_trim_counter, journal_trim_interval;
|
||||||
|
bool trimming;
|
||||||
void* journal_superblock;
|
void* journal_superblock;
|
||||||
|
|
||||||
int active_flushers;
|
int active_flushers;
|
||||||
|
|
|
@ -399,8 +399,6 @@ resume_1:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Trim journal on start so we don't stall when all entries are older
|
|
||||||
bs->journal.trim();
|
|
||||||
bs->journal.dirty_start = bs->journal.next_free;
|
bs->journal.dirty_start = bs->journal.next_free;
|
||||||
printf(
|
printf(
|
||||||
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
|
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
|
||||||
|
|
|
@ -184,7 +184,7 @@ journal_t::~journal_t()
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool journal_t::trim()
|
uint64_t journal_t::get_trim_pos()
|
||||||
{
|
{
|
||||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
@ -202,26 +202,19 @@ bool journal_t::trim()
|
||||||
if (journal_used_it == used_sectors.end())
|
if (journal_used_it == used_sectors.end())
|
||||||
{
|
{
|
||||||
// Journal is empty
|
// Journal is empty
|
||||||
used_start = next_free;
|
return next_free;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
used_start = journal_used_it->first;
|
// next_free does not need updating during trim
|
||||||
// next_free does not need updating here
|
return journal_used_it->first;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (journal_used_it->first > used_start)
|
else if (journal_used_it->first > used_start)
|
||||||
{
|
{
|
||||||
// Journal is cleared up to <journal_used_it>
|
// Journal is cleared up to <journal_used_it>
|
||||||
used_start = journal_used_it->first;
|
return journal_used_it->first;
|
||||||
}
|
}
|
||||||
else
|
// Can't trim journal
|
||||||
{
|
return used_start;
|
||||||
// Can't trim journal
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
|
|
||||||
#endif
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -169,6 +169,7 @@ struct journal_t
|
||||||
|
|
||||||
~journal_t();
|
~journal_t();
|
||||||
bool trim();
|
bool trim();
|
||||||
|
uint64_t get_trim_pos();
|
||||||
};
|
};
|
||||||
|
|
||||||
struct blockstore_journal_check_t
|
struct blockstore_journal_check_t
|
||||||
|
|
|
@ -148,7 +148,6 @@ resume_5:
|
||||||
{
|
{
|
||||||
mark_rolled_back(*v);
|
mark_rolled_back(*v);
|
||||||
}
|
}
|
||||||
journal.trim();
|
|
||||||
// Acknowledge op
|
// Acknowledge op
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
|
|
Loading…
Reference in New Issue