From 089f138e0cf9c0dc8d71c4b3150b3e24757b8d8b Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 3 Dec 2020 20:33:07 +0300 Subject: [PATCH] Allow situations where the journal contains a big_write(v1) after delete(v2) and v1 < v2 Fixes a crash in the following scenario: - client issues a delete request (object version is at least 2) - OSD has time to flush it to the metadata, but doesn't have time to move the journal start pointer on disk - client overwrites the same object and it gets the version number 1 again - OSD is restarted and sees delete(v=2), big_write(v=1) in the journal - dirty_db sequence gets broken and OSD crashes with assert("Writes and deletes shouldn't happen at the same time") --- blockstore_init.cpp | 39 +++++++++++++++++++++++++++++++++++++++ blockstore_write.cpp | 5 ++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/blockstore_init.cpp b/blockstore_init.cpp index d73833e16..cf80080f6 100644 --- a/blockstore_init.cpp +++ b/blockstore_init.cpp @@ -562,6 +562,45 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location ); #endif + auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){ + .oid = je->big_write.oid, + .version = UINT64_MAX, + }); + if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0) + { + dirty_it--; + if (dirty_it->first.oid == je->big_write.oid && + (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE) + { + // It is allowed to overwrite a deleted object with a + // version number less than deletion version number, + // because the presence of a BIG_WRITE entry means that + // the data for it is already on disk. + // Purge all dirty and clean entries for this object. + auto dirty_end = dirty_it; + dirty_end++; + while (1) + { + if (dirty_it == bs->dirty_db.begin()) + { + break; + } + dirty_it--; + if (dirty_it->first.oid != je->big_write.oid) + { + dirty_it++; + break; + } + } + bs->erase_dirty(dirty_it, dirty_end, UINT64_MAX); + auto clean_it = bs->clean_db.find(je->big_write.oid); + if (clean_it != bs->clean_db.end()) + { + bs->data_alloc->set(clean_it->second.location >> bs->block_order, false); + bs->clean_db.erase(clean_it); + } + } + } auto clean_it = bs->clean_db.find(je->big_write.oid); if (clean_it == bs->clean_db.end() || clean_it->second.version < je->big_write.version) diff --git a/blockstore_write.cpp b/blockstore_write.cpp index 3feff3504..53a3322c8 100644 --- a/blockstore_write.cpp +++ b/blockstore_write.cpp @@ -492,7 +492,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op) prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->pending_ops++; - // Remember small write as unsynced + } + else + { + // Remember delete as unsynced unsynced_small_writes.push_back((obj_ver_id){ .oid = op->oid, .version = op->version,