From 86b46829753df7832342a5c7052d5146352e3891 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 17 Jun 2023 00:03:28 +0300 Subject: [PATCH] Put get_trim_pos into the "critical section". Fixes rare journal corruption issue The consequence of this issue was that in some very rare cases (only reproduced under load in CI when running 4+ tests in parallel) small write data written to journal could overwrite journal entries. Also add an assert-type safety check to be able to catch this issue in the future again in case of a regression. --- src/blockstore_flush.cpp | 79 ++++++++++++++++++++------------------ src/blockstore_journal.cpp | 22 +++++++---- src/blockstore_write.cpp | 18 ++++++++- 3 files changed, 73 insertions(+), 46 deletions(-) diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 53489c2e..7f72542a 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -675,53 +675,58 @@ resume_1: return false; } flusher->trimming = true; - // First update journal "superblock" and only then update in memory - await_sqe(12); - *((journal_entry_start*)flusher->journal_superblock) = { - .crc32 = 0, - .magic = JOURNAL_MAGIC, - .type = JE_START, - .size = sizeof(journal_entry_start), - .reserved = 0, - .journal_start = new_trim_pos, - .version = JOURNAL_VERSION, - }; - ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); - data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size }; - data->callback = simple_callback_w; - my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); - wait_count++; - resume_13: - if (wait_count > 0) + // Recheck the position with the "lock" taken + new_trim_pos = bs->journal.get_trim_pos(); + if (new_trim_pos != bs->journal.used_start) { - wait_state = 13; - return false; - } - if (!bs->disable_journal_fsync) - { - await_sqe(20); - my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC); - data->iov = { 0 }; + // First update journal "superblock" and only then update in memory + await_sqe(12); + *((journal_entry_start*)flusher->journal_superblock) = { + .crc32 = 0, + .magic = JOURNAL_MAGIC, + .type = JE_START, + .size = sizeof(journal_entry_start), + .reserved = 0, + .journal_start = new_trim_pos, + .version = JOURNAL_VERSION, + }; + ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); + data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size }; data->callback = simple_callback_w; - resume_21: + my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); + wait_count++; + resume_13: if (wait_count > 0) { - wait_state = 21; + wait_state = 13; return false; } - } - bs->journal.used_start = new_trim_pos; + if (!bs->disable_journal_fsync) + { + await_sqe(20); + my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC); + data->iov = { 0 }; + data->callback = simple_callback_w; + resume_21: + if (wait_count > 0) + { + wait_state = 21; + return false; + } + } + bs->journal.used_start = new_trim_pos; #ifdef BLOCKSTORE_DEBUG - printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free); + printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free); #endif + if (bs->journal.flush_journal && !flusher->flush_queue.size()) + { + assert(bs->journal.used_start == bs->journal.next_free); + printf("Journal flushed\n"); + exit(0); + } + } flusher->trimming = false; } - if (bs->journal.flush_journal && !flusher->flush_queue.size()) - { - assert(bs->journal.used_start == bs->journal.next_free); - printf("Journal flushed\n"); - exit(0); - } } // All done flusher->active_flushers--; diff --git a/src/blockstore_journal.cpp b/src/blockstore_journal.cpp index 37974308..bbb60cdf 100644 --- a/src/blockstore_journal.cpp +++ b/src/blockstore_journal.cpp @@ -236,14 +236,6 @@ journal_t::~journal_t() uint64_t journal_t::get_trim_pos() { auto journal_used_it = used_sectors.lower_bound(used_start); -#ifdef BLOCKSTORE_DEBUG - printf( - "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n", - used_start, next_free, dirty_start, - journal_used_it == used_sectors.end() ? 0 : journal_used_it->first, - journal_used_it == used_sectors.end() ? 0 : journal_used_it->second - ); -#endif if (journal_used_it == used_sectors.end()) { // Journal is cleared to its end, restart from the beginning @@ -256,12 +248,26 @@ uint64_t journal_t::get_trim_pos() else { // next_free does not need updating during trim +#ifdef BLOCKSTORE_DEBUG + printf( + "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n", + used_start, next_free, dirty_start, + journal_used_it->first, journal_used_it->second + ); +#endif return journal_used_it->first; } } else if (journal_used_it->first > used_start) { // Journal is cleared up to +#ifdef BLOCKSTORE_DEBUG + printf( + "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n", + used_start, next_free, dirty_start, + journal_used_it->first, journal_used_it->second + ); +#endif return journal_used_it->first; } // Can't trim journal diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 17a7805b..74ada503 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -409,7 +409,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) ); #endif // Figure out where data will be - journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size; + auto next_next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size; + if (op->len > 0) + { + auto journal_used_it = journal.used_sectors.lower_bound(next_next_free); + if (journal_used_it != journal.used_sectors.end() && + journal_used_it->first < next_next_free + op->len) + { + printf( + "BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!" + " Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n", + journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len, + journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start + ); + exit(1); + } + } + journal.next_free = next_next_free; je->oid = op->oid; je->version = op->version; je->offset = op->offset;