From a8464c19afd40f6c5dff7c0950124b2fb1b7c8d1 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 13 Jul 2023 00:58:43 +0300 Subject: [PATCH] Support keeping checksums on disk (not in memory) Definitely beneficial for SSD+HDD setups --- src/blockstore_flush.cpp | 12 +++--- src/blockstore_flush.h | 2 + src/blockstore_impl.cpp | 4 +- src/blockstore_impl.h | 7 +-- src/blockstore_init.cpp | 2 +- src/blockstore_open.cpp | 7 ++- src/blockstore_read.cpp | 89 +++++++++++++++++++++++++++++++++------ src/osd_primary_write.cpp | 5 +++ 8 files changed, 99 insertions(+), 29 deletions(-) diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 403f1106..45fe2bb8 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -666,7 +666,10 @@ void journal_flusher_co::update_metadata_entry() new_entry->oid = cur.oid; new_entry->version = cur.version; if (!bs->inmemory_meta) - memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size); + { + auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size; + memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size); + } if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2) { // Calculate metadata entry checksum @@ -767,7 +770,8 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base) { assert(!(v[i].offset % bs->dsk.csum_block_size)); assert(!(v[i].len % bs->dsk.csum_block_size)); - bs->verify_padded_checksums(new_clean_bitmap, false, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum) + bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size, + v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum) { printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n", cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum); @@ -1131,9 +1135,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base) resume_0: if (!modify_meta_read(clean_loc, meta_new, wait_base+0)) return false; - new_clean_bitmap = (bs->inmemory_meta - ? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry) - : (uint8_t*)bs->clean_dyn_data + (clean_loc >> bs->dsk.block_order)*bs->dsk.clean_dyn_size); + new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry); if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc) { resume_1: diff --git a/src/blockstore_flush.h b/src/blockstore_flush.h index db73228c..91ce36a6 100644 --- a/src/blockstore_flush.h +++ b/src/blockstore_flush.h @@ -6,6 +6,8 @@ #define COPY_BUF_ZERO 4 #define COPY_BUF_CSUM_FILL 8 #define COPY_BUF_COALESCED 16 +#define COPY_BUF_META_BLOCK 32 +#define COPY_BUF_JOURNALED_BIG 64 struct copy_buffer_t { diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index 36da71af..578b5f2d 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -39,8 +39,8 @@ blockstore_impl_t::~blockstore_impl_t() dsk.close_all(); if (metadata_buffer) free(metadata_buffer); - if (clean_dyn_data) - free(clean_dyn_data); + if (clean_bitmaps) + free(clean_bitmaps); } bool blockstore_impl_t::is_started() diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index e50a9cc5..54387ee2 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -269,7 +269,7 @@ class blockstore_impl_t std::map clean_db_settings; std::map clean_db_shards; - uint8_t *clean_dyn_data = NULL; + uint8_t *clean_bitmaps = NULL; blockstore_dirty_db_t dirty_db; std::vector submit_queue; std::vector unsynced_big_writes, unsynced_small_writes; @@ -347,11 +347,12 @@ class blockstore_impl_t bool read_range_fulfilled(std::vector & rv, uint64_t & fulfilled, uint8_t *read_buf, uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end); bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc); - bool verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset, + uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos); + bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset, iovec *iov, int n_iov, std::function bad_block_cb); bool verify_journal_checksums(uint8_t *csums, uint32_t offset, iovec *iov, int n_iov, std::function bad_block_cb); - bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov); + bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov); int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len, uint32_t item_state, uint64_t item_version); void handle_read_event(ring_data_t *data, blockstore_op_t *op); diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index d90e3a51..be32b588 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -334,7 +334,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_ } if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size) { - memcpy(bs->clean_dyn_data + (done_cnt+i)*bs->dsk.clean_dyn_size, &entry->bitmap, bs->dsk.clean_dyn_size); + memcpy(bs->clean_bitmaps + (done_cnt+i) * 2 * bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2 * bs->dsk.clean_entry_bitmap_size); } auto & clean_db = bs->clean_db_shard(entry->oid); auto clean_it = clean_db.find(entry->oid); diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 207bdd32..eaead712 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -139,13 +139,12 @@ void blockstore_impl_t::calc_lengths() } else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type) { - // FIXME: allow to store bitmap, but read checksums from the disk - clean_dyn_data = (uint8_t*)malloc(dsk.block_count * dsk.clean_dyn_size); - if (!clean_dyn_data) + clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size); + if (!clean_bitmaps) { throw std::runtime_error( "Failed to allocate memory for the metadata sparse write bitmap ("+ - std::to_string(dsk.block_count * dsk.clean_dyn_size / 1024 / 1024)+" MB)" + std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)" ); } } diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index 2eb675ef..9c6f246b 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -159,7 +159,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset); } else - clean_entry_bitmap = (uint8_t*)(clean_dyn_data + meta_loc*dsk.clean_dyn_size + offset); + clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset); return clean_entry_bitmap; } @@ -196,7 +196,7 @@ int blockstore_impl_t::fill_partial_checksum_blocks(std::vector & end_block++; // OK, mark this range as required rv.push_back((copy_buffer_t){ - .copy_flags = COPY_BUF_CSUM_FILL, + .copy_flags = COPY_BUF_CSUM_FILL | (from_journal ? COPY_BUF_JOURNALED_BIG : 0), .offset = start_block*dsk.csum_block_size, .len = (end_block-start_block)*dsk.csum_block_size, // save clean_entry_bitmap if we're reading clean data from the journal @@ -601,6 +601,15 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & auto & rv = PRIV(read_op)->read_vec; int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal, (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len); + if (!inmemory_meta && !from_journal && req > 0) + { + // Read checksums from disk + uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req); + for (int i = req; i > 0; i--) + { + rv[rv.size()-i].csum_buf = csum_buf; + } + } for (int i = req; i > 0; i--) { if (!read_checksum_block(read_op, i, fulfilled, clean_loc)) @@ -633,6 +642,8 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & } else { + bool csum_done = !dsk.csum_block_size || inmemory_meta; + uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size; uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity; while (bmp_start < bmp_size) { @@ -653,8 +664,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & } if (bmp_end > bmp_start) { - uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size + - bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF)); + if (!csum_done) + { + // Read checksums from disk + csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size()); + csum_done = true; + } + uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF)); if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity, bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data)) @@ -675,11 +691,31 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & return true; } -bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset, +uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos) +{ + auto & rv = PRIV(op)->read_vec; + auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size; + auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size; + uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size); + rv.insert(rv.begin()+rv_pos, (copy_buffer_t){ + .copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL, + .offset = pos, + .buf = buf, + }); + BS_SUBMIT_GET_SQE(sqe, data); + data->iov = (struct iovec){ buf, dsk.meta_block_size }; + PRIV(op)->pending_ops++; + my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector); + data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); }; + // return pointer to checksums + return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size; +} + +bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset, iovec *iov, int n_iov, std::function bad_block_cb) { assert(!(offset % dsk.csum_block_size)); - uint32_t *csums = (uint32_t*)(clean_entry_bitmap + (is_journal ? 1 : 2)*dsk.clean_entry_bitmap_size); + uint32_t *csums = (uint32_t*)csum_buf; uint32_t block_csum = 0; uint32_t block_done = 0; uint32_t block_num = clean_entry_bitmap ? offset/dsk.csum_block_size : 0; @@ -767,18 +803,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset return true; } -bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov) +bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov) { uint32_t offset = clean_loc % dsk.data_block_size; clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order; // First verify against the newest checksum version uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0); - if (verify_padded_checksums(clean_entry_bitmap, false, offset, iov, n_iov, NULL)) + if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL)) return true; // Check through all relevant "metadata backups" possibly added by flushers auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used }); for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++) - if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta, false, offset, iov, n_iov, NULL)) + if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta, + mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL)) return true; return false; } @@ -798,13 +835,22 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op { // verify checksums if required auto & rv = PRIV(op)->read_vec; + void *meta_block = NULL; if (dsk.csum_block_size > dsk.bitmap_granularity) { - bool ok = true; for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--) { + if (rv[i].copy_flags & COPY_BUF_META_BLOCK) + { + // Metadata read. Skip + assert(!meta_block); + meta_block = rv[i].buf; + rv[i].buf = NULL; + continue; + } struct iovec *iov = (struct iovec*)((uint8_t*)rv[i].buf + (rv[i].len & 0xFFFFFFFF)); int n_iov = rv[i].len >> 32; + bool ok = true; if (rv[i].copy_flags & COPY_BUF_JOURNAL) { // SMALL_WRITE from journal @@ -821,10 +867,11 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op } ); } - else if (rv[i].csum_buf) + else if (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG) { // BIG_WRITE from journal - verify_padded_checksums(rv[i].csum_buf, true, rv[i].disk_offset % dsk.data_block_size, iov, n_iov, + verify_padded_checksums(rv[i].csum_buf, rv[i].csum_buf + dsk.clean_entry_bitmap_size, + rv[i].disk_offset % dsk.data_block_size, iov, n_iov, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum) { ok = false; @@ -839,7 +886,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op else { // Clean data - ok = verify_clean_padded_checksums(op, rv[i].disk_offset, iov, n_iov); + ok = verify_clean_padded_checksums(op, rv[i].disk_offset, rv[i].csum_buf, iov, n_iov); } if (!ok) { @@ -867,6 +914,14 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op { for (auto & vec: rv) { + if (vec.copy_flags & COPY_BUF_META_BLOCK) + { + // Metadata read. Skip + assert(!meta_block); + meta_block = vec.buf; + vec.buf = NULL; + continue; + } if (vec.csum_buf) { uint32_t *csum = (uint32_t*)vec.csum_buf; @@ -879,7 +934,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op "Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n", op->oid.inode, op->oid.stripe, op->version, (vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p, - crc32c(0, (uint8_t*)op->buf + vec.offset + p, dsk.csum_block_size), *csum + crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum ); op->retval = -EDOM; break; @@ -893,6 +948,12 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op } } } + if (meta_block) + { + // Free after checking + free(meta_block); + meta_block = NULL; + } } if (PRIV(op)->clean_version_used) { diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index 7b8ce74e..9da63aa3 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -145,6 +145,11 @@ resume_3: // Mark object corrupted and retry op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false); op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data(); + if (cur_op->rmw_buf) + { + free(cur_op->rmw_buf); + cur_op->rmw_buf = NULL; + } goto retry_1; } deref_object_state(pg, &op_data->object_state, true);