Put get_trim_pos into the "critical section". Fixes rare journal corruption issue

The consequence of this issue was that in some very rare cases (only reproduced
under load in CI, when running 4+ tests in parallel) the data of small writes
could be written over existing journal entries, corrupting the journal.
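
In other words, the trim position has to be recomputed after the flusher takes the trimming "lock", instead of reusing a value computed before the wait. Below is a condensed, compilable sketch of that pattern; journal_state, trim() and the offsets in main() are simplified stand-ins for illustration, not the actual flusher coroutine, while get_trim_pos() and the trimming flag mirror the real journal_t fields.

// A condensed sketch of the pattern introduced by the fix, not the actual flusher code
#include <cstdint>
#include <cstdio>
#include <map>

struct journal_state
{
    uint64_t used_start = 0;
    std::map<uint64_t, uint64_t> used_sectors; // used offset -> refcount
    bool trimming = false;                     // the "lock": only one flusher trims at a time

    // Simplified journal_t::get_trim_pos(): first offset that is still
    // referenced (the real function also handles wrap-around)
    uint64_t get_trim_pos() const
    {
        auto it = used_sectors.lower_bound(used_start);
        return it == used_sectors.end() ? used_start : it->first;
    }
};

void trim(journal_state & js)
{
    // ... wait here until js.trimming == false; other flushers and writers
    // may change the journal while this coroutine is waiting ...
    js.trimming = true;
    // Recheck the position with the "lock" taken: a value computed before
    // the wait could be stale and must not reach the superblock or used_start
    uint64_t new_trim_pos = js.get_trim_pos();
    if (new_trim_pos != js.used_start)
    {
        // ... write the JE_START "superblock" with new_trim_pos, fsync ...
        js.used_start = new_trim_pos;
    }
    js.trimming = false;
}

int main()
{
    journal_state js;
    js.used_sectors[0x2000] = 1; // oldest still-referenced journal offset
    trim(js);
    printf("used_start=%08llx\n", (unsigned long long)js.used_start); // 00002000
    return 0;
}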

Also add an assert-type safety check to catch this issue again in the future
in case of a regression.
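
The added check boils down to a lower_bound() lookup in the map of still-referenced journal offsets (journal.used_sectors) right before the new write position is committed in dequeue_write(). A minimal sketch of the same idea, assuming a bare std::map in place of the real journal state and a hypothetical overlaps_used() helper with made-up offsets; the real check prints detailed diagnostics and calls exit(1):

#include <cstdint>
#include <cstdio>
#include <map>

// Hypothetical helper mirroring the added check: true if any still-referenced
// journal offset falls inside the planned write range [pos, pos + len)
static bool overlaps_used(const std::map<uint64_t, uint64_t> & used_sectors,
    uint64_t pos, uint64_t len)
{
    auto it = used_sectors.lower_bound(pos); // first used offset >= pos
    return it != used_sectors.end() && it->first < pos + len;
}

int main()
{
    std::map<uint64_t, uint64_t> used_sectors;
    used_sectors[0x1000] = 2; // a journal offset still referenced twice
    // A 0x2000-byte small write starting at 0x800 would run into offset 0x1000
    if (overlaps_used(used_sectors, 0x800, 0x2000))
    {
        printf("BUG: attempt to overwrite a used journal offset\n");
        return 1; // the real check exits instead of corrupting the journal
    }
    return 0;
}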
Vitaliy Filippov 2023-06-17 00:03:28 +03:00
parent bdd48e4cf1
commit 86b4682975
3 changed files with 73 additions and 46 deletions


@@ -675,53 +675,58 @@ resume_1:
                 return false;
             }
             flusher->trimming = true;
-            // First update journal "superblock" and only then update <used_start> in memory
-            await_sqe(12);
-            *((journal_entry_start*)flusher->journal_superblock) = {
-                .crc32 = 0,
-                .magic = JOURNAL_MAGIC,
-                .type = JE_START,
-                .size = sizeof(journal_entry_start),
-                .reserved = 0,
-                .journal_start = new_trim_pos,
-                .version = JOURNAL_VERSION,
-            };
-            ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
-            data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
-            data->callback = simple_callback_w;
-            my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
-            wait_count++;
-        resume_13:
-            if (wait_count > 0)
+            // Recheck the position with the "lock" taken
+            new_trim_pos = bs->journal.get_trim_pos();
+            if (new_trim_pos != bs->journal.used_start)
             {
-                wait_state = 13;
-                return false;
-            }
-            if (!bs->disable_journal_fsync)
-            {
-                await_sqe(20);
-                my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
-                data->iov = { 0 };
+                // First update journal "superblock" and only then update <used_start> in memory
+                await_sqe(12);
+                *((journal_entry_start*)flusher->journal_superblock) = {
+                    .crc32 = 0,
+                    .magic = JOURNAL_MAGIC,
+                    .type = JE_START,
+                    .size = sizeof(journal_entry_start),
+                    .reserved = 0,
+                    .journal_start = new_trim_pos,
+                    .version = JOURNAL_VERSION,
+                };
+                ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
+                data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
                 data->callback = simple_callback_w;
-            resume_21:
+                my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
+                wait_count++;
+            resume_13:
                 if (wait_count > 0)
                 {
-                    wait_state = 21;
+                    wait_state = 13;
                     return false;
                 }
-            }
-            bs->journal.used_start = new_trim_pos;
+                if (!bs->disable_journal_fsync)
+                {
+                    await_sqe(20);
+                    my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
+                    data->iov = { 0 };
+                    data->callback = simple_callback_w;
+                resume_21:
+                    if (wait_count > 0)
+                    {
+                        wait_state = 21;
+                        return false;
+                    }
+                }
+                bs->journal.used_start = new_trim_pos;
 #ifdef BLOCKSTORE_DEBUG
-            printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
+                printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
 #endif
+                if (bs->journal.flush_journal && !flusher->flush_queue.size())
+                {
+                    assert(bs->journal.used_start == bs->journal.next_free);
+                    printf("Journal flushed\n");
+                    exit(0);
+                }
+            }
             flusher->trimming = false;
         }
-        if (bs->journal.flush_journal && !flusher->flush_queue.size())
-        {
-            assert(bs->journal.used_start == bs->journal.next_free);
-            printf("Journal flushed\n");
-            exit(0);
-        }
     }
     // All done
     flusher->active_flushers--;


@@ -236,14 +236,6 @@ journal_t::~journal_t()
 uint64_t journal_t::get_trim_pos()
 {
     auto journal_used_it = used_sectors.lower_bound(used_start);
-#ifdef BLOCKSTORE_DEBUG
-    printf(
-        "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
-        used_start, next_free, dirty_start,
-        journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
-        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
-    );
-#endif
     if (journal_used_it == used_sectors.end())
     {
         // Journal is cleared to its end, restart from the beginning
@@ -256,12 +248,26 @@ uint64_t journal_t::get_trim_pos()
         else
         {
             // next_free does not need updating during trim
+#ifdef BLOCKSTORE_DEBUG
+            printf(
+                "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+                used_start, next_free, dirty_start,
+                journal_used_it->first, journal_used_it->second
+            );
+#endif
             return journal_used_it->first;
         }
     }
     else if (journal_used_it->first > used_start)
     {
         // Journal is cleared up to <journal_used_it>
+#ifdef BLOCKSTORE_DEBUG
+        printf(
+            "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+            used_start, next_free, dirty_start,
+            journal_used_it->first, journal_used_it->second
+        );
+#endif
         return journal_used_it->first;
     }
     // Can't trim journal


@@ -409,7 +409,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         );
 #endif
         // Figure out where data will be
-        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
+        auto next_next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
+        if (op->len > 0)
+        {
+            auto journal_used_it = journal.used_sectors.lower_bound(next_next_free);
+            if (journal_used_it != journal.used_sectors.end() &&
+                journal_used_it->first < next_next_free + op->len)
+            {
+                printf(
+                    "BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!"
+                    " Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n",
+                    journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
+                    journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
+                );
+                exit(1);
+            }
+        }
+        journal.next_free = next_next_free;
         je->oid = op->oid;
         je->version = op->version;
         je->offset = op->offset;