From 4181add1f43a91af1d811af029489a0640819356 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Fri, 14 Jul 2023 00:02:59 +0300
Subject: [PATCH] Remove creepy "metadata copying" during overwrite

Instead, just do not verify checksums of objects that are currently
being overwritten. When clean data modification during flush runs in
parallel with a read request, that request may read a mix of old and
new data. It may even read a mix of multiple flushed versions if it
takes too long... And attempts to verify such reads using temporary
copies of metadata make the algorithm too complex and creepy.
---
 src/blockstore_flush.cpp | 74 +++++++++++++++++----------------
 src/blockstore_flush.h   |  3 +-
 src/blockstore_impl.h    |  5 ++-
 src/blockstore_read.cpp  | 90 +++++++++++++++++-----------------------
 4 files changed, 83 insertions(+), 89 deletions(-)
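Note: the sketch below is illustrative C++ only, not vitastor code; all names
in it are invented. It shows the torn read the message describes: a read that
overlaps an in-place flush can return a buffer matching neither the old nor
the new version of the object, so its checksum cannot be expected to match
any single stored value.

    // Illustrative model of a torn read. A read running concurrently with an
    // in-place rewrite may copy half of the old data and half of the new data,
    // so verifying the result against the stored checksum of any one version
    // can fail spuriously, without any real corruption.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        uint8_t disk[8], torn[8];
        memset(disk, 0xAA, 8);         // "clean" data, version 1
        memcpy(torn, disk, 4);         // reader copies the first half...
        memset(disk, 0xBB, 8);         // ...a flush overwrites the block in place...
        memcpy(torn + 4, disk + 4, 4); // ...and the reader copies the second half
        printf("torn read: %02x..%02x (matches neither v1 nor v2)\n", torn[0], torn[7]);
        return 0;
    }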
diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp
index 45fe2bb8..c939a562 100644
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -160,6 +160,22 @@ void journal_flusher_t::remove_flush(object_id oid)
     }
 }
 
+bool journal_flusher_t::is_flushed_over(obj_ver_id ov)
+{
+    auto r_it = sync_to_repeat.find(ov.oid);
+    if (r_it != sync_to_repeat.end())
+    {
+        for (int i = 0; i < cur_flusher_count; i++)
+        {
+            if (co[i].wait_state != 0 && co[i].cur.oid == ov.oid)
+            {
+                return co[i].cur.version > ov.version;
+            }
+        }
+    }
+    return false;
+}
+
 void journal_flusher_t::request_trim()
 {
     dequeuing = true;
@@ -444,6 +460,7 @@ stop_flusher:
         else
         {
             clean_loc = old_clean_loc;
+            clean_ver = old_clean_ver;
         }
     }
     // Submit dirty data and old checksum data reads
@@ -479,6 +496,15 @@ resume_2:
         wait_state = wait_base+13;
         return false;
     }
+    if (bs->dsk.csum_block_size)
+    {
+        // Mark objects used by reads as modified
+        auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = clean_ver });
+        if (uo_it != bs->used_clean_objects.end())
+        {
+            uo_it->second.was_changed = true;
+        }
+    }
     // Submit data writes
     for (it = v.begin(); it != v.end(); it++)
     {
@@ -601,34 +627,6 @@ void journal_flusher_co::update_metadata_entry()
     }
     else
     {
-        if (bs->dsk.csum_block_size)
-        {
-            // Copy the whole old metadata entry before updating it if the updated object is being read
-            obj_ver_id ov = { .oid = new_entry->oid, .version = new_entry->version };
-            auto uo_it = bs->used_clean_objects.upper_bound(ov);
-            if (uo_it != bs->used_clean_objects.begin())
-            {
-                uo_it--;
-                if (uo_it->first.oid == new_entry->oid)
-                {
-                    uint8_t *meta_copy = (uint8_t*)malloc_or_die(bs->dsk.clean_entry_size);
-                    memcpy(meta_copy, new_entry, bs->dsk.clean_entry_size);
-                    // The reads should free all metadata entry backups when they don't need them anymore
-                    if (uo_it->first.version < new_entry->version)
-                    {
-                        // If ==, write in place
-                        uo_it++;
-                        bs->used_clean_objects.insert(uo_it, std::make_pair(ov, (used_clean_obj_t){
-                            .meta = meta_copy,
-                        }));
-                    }
-                    else
-                    {
-                        uo_it->second.meta = meta_copy;
-                    }
-                }
-            }
-        }
         // Set initial internal bitmap bits from the big write
         if (clean_init_bitmap)
         {
@@ -725,6 +723,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int wait_base)
     return true;
 }
 
+// Punch holes in incomplete checksum blocks
 bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
 {
     if (wait_state == wait_base) goto resume_0;
@@ -805,12 +804,6 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
             });
         }
     }
-    // Actually clear bits
-    for (auto it = v.begin(); it != v.end(); it++)
-    {
-        if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
-            bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
-    }
     {
         clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
         if (new_entry->oid != cur.oid)
@@ -822,11 +815,20 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
            );
        }
        assert(new_entry->oid == cur.oid);
+        // Actually clear bits
+        for (auto it = v.begin(); it != v.end(); it++)
+        {
+            if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
+                bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
+        }
         // Calculate block checksums with new holes
         uint32_t *new_data_csums = (uint32_t*)(new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size);
         calc_block_checksums(new_data_csums, true);
         if (!bs->inmemory_meta)
-            memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size);
+        {
+            auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size;
+            memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size);
+        }
         if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
         {
             // calculate metadata entry checksum
@@ -914,6 +916,7 @@ void journal_flusher_co::scan_dirty()
     v.clear();
     copy_count = 0;
     clean_loc = UINT64_MAX;
+    clean_ver = 0;
     has_delete = false;
     has_writes = false;
     skip_copy = false;
@@ -989,6 +992,7 @@ void journal_flusher_co::scan_dirty()
             // There is an unflushed big write. Copy small writes in its position
             has_writes = true;
             clean_loc = dirty_it->second.location;
+            clean_ver = dirty_it->first.version;
             clean_init_bitmap = true;
             clean_bitmap_offset = dirty_it->second.offset;
             clean_bitmap_len = dirty_it->second.len;
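A condensed model of the new is_flushed_over() check above (the types are
simplified stand-ins for journal_flusher_t state, not the real ones): it
answers "is some active flusher already working on a newer version of this
object?", which is exactly the case where the reader's clean version may be
rewritten underneath the read.

    // Simplified model of is_flushed_over(); names are illustrative.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct obj_ver { uint64_t oid = 0, version = 0; };

    struct flusher_co
    {
        int wait_state = 0; // != 0 while the coroutine is mid-flush
        obj_ver cur;
    };

    // True if an active flusher works on a newer version of the same object
    bool is_flushed_over(const std::vector<flusher_co> & co, obj_ver ov)
    {
        for (auto & f: co)
            if (f.wait_state != 0 && f.cur.oid == ov.oid)
                return f.cur.version > ov.version;
        return false;
    }

    int main()
    {
        std::vector<flusher_co> co(4);
        co[1].wait_state = 5;
        co[1].cur = obj_ver{ 7, 10 };
        printf("%d\n", is_flushed_over(co, obj_ver{ 7, 9 })); // 1: v9 is flushed over
        printf("%d\n", is_flushed_over(co, obj_ver{ 8, 9 })); // 0: different object
        return 0;
    }

The real implementation additionally requires the object to be present in
sync_to_repeat before scanning the coroutines, as the hunk above shows.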
diff --git a/src/blockstore_flush.h b/src/blockstore_flush.h
index 91ce36a6..5cbfda4a 100644
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@@ -67,7 +67,7 @@ class journal_flusher_co
     bool fill_incomplete, cleared_incomplete;
     int read_to_fill_incomplete;
     int copy_count;
-    uint64_t clean_loc, old_clean_loc, old_clean_ver;
+    uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
     flusher_meta_write_t meta_old, meta_new;
     bool clean_init_bitmap;
     uint64_t clean_bitmap_offset, clean_bitmap_len;
@@ -136,4 +136,5 @@ public:
     void unshift_flush(obj_ver_id oid, bool force);
     void remove_flush(object_id oid);
     void dump_diagnostics();
+    bool is_flushed_over(obj_ver_id ov);
 };
diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h
index 54387ee2..db54bfdb 100644
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -181,7 +181,7 @@ struct used_clean_obj_t
 {
     int refs;
     uint64_t freed_block; // block+1 if freed, otherwise 0
-    uint8_t *meta; // metadata copy
+    bool was_changed; // was changed by a parallel flush?
 };
 
 // https://github.com/algorithm-ninja/cpp-btree
@@ -352,7 +352,8 @@ class blockstore_impl_t
         iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
     bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
         iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
-    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov);
+    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
     int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
         uint32_t item_state, uint64_t item_version);
     void handle_read_event(ring_data_t *data, blockstore_op_t *op);
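The resulting protocol around used_clean_objects, sketched under simplifying
assumptions (single-threaded event loop; the freed_block bookkeeping and
io_uring plumbing are left out; all names below are illustrative): readers
pin an (object, version) pair before issuing disk reads, a parallel flush
merely flags it instead of snapshotting the metadata entry, and the reader
decides at completion whether checksums are trustworthy.

    // Sketch of the simplified used_clean_objects protocol.
    #include <cassert>
    #include <cstdint>
    #include <map>

    struct obj_ver
    {
        uint64_t oid, version;
        bool operator<(const obj_ver & b) const
        {
            return oid < b.oid || (oid == b.oid && version < b.version);
        }
    };

    struct used_clean_obj
    {
        int refs = 0;
        bool was_changed = false; // set by a parallel flush, as in the patch
    };

    static std::map<obj_ver, used_clean_obj> used_clean_objects;

    // Reader: pin the clean version before submitting disk reads
    void pin(obj_ver ov) { used_clean_objects[ov].refs++; }

    // Flusher: instead of copying the metadata entry, just mark the object
    void mark_changed(obj_ver ov)
    {
        auto it = used_clean_objects.find(ov);
        if (it != used_clean_objects.end())
            it->second.was_changed = true;
    }

    // Reader: unpin on completion; returns whether checksums may be verified
    bool unpin_may_verify(obj_ver ov)
    {
        auto it = used_clean_objects.find(ov);
        assert(it != used_clean_objects.end() && it->second.refs > 0);
        bool verify = !it->second.was_changed;
        if (--it->second.refs == 0)
            used_clean_objects.erase(it);
        return verify;
    }

    int main()
    {
        obj_ver ov{ 1, 5 };
        pin(ov);
        mark_changed(ov);              // a flush rewrites the object's block
        assert(!unpin_may_verify(ov)); // so this read must skip verification
        return 0;
    }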
diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp
index 9c6f246b..cf57dd1b 100644
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -643,7 +643,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
     else
     {
         bool csum_done = !dsk.csum_block_size || inmemory_meta;
-        uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size;
+        uint8_t *csum_buf = clean_entry_bitmap;
         uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
         while (bmp_start < bmp_size)
         {
@@ -670,7 +670,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
                     csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
                     csum_done = true;
                 }
-                uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF));
+                uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
                 if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
                     bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
                     clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
@@ -685,7 +685,10 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
     if (PRIV(read_op)->clean_version_used)
     {
         obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver };
-        used_clean_objects[ov].refs++;
+        auto & uo = used_clean_objects[ov];
+        uo.refs++;
+        if (dsk.csum_block_size && flusher->is_flushed_over(ov))
+            uo.was_changed = true;
         PRIV(read_op)->clean_version_used = ov.version;
     }
     return true;
@@ -707,8 +710,8 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
     PRIV(op)->pending_ops++;
     my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
     data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
-    // return pointer to checksums
-    return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
+    // return pointer to checksums + bitmap
+    return buf + pos + sizeof(clean_disk_entry);
 }
 
 bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
@@ -803,21 +806,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset,
     return true;
 }
 
-bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov)
+bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+    iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
 {
     uint32_t offset = clean_loc % dsk.data_block_size;
+    if (from_journal)
+        return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
     clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
-    // First verify against the newest checksum version
-    uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
-    if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL))
-        return true;
-    // Check through all relevant "metadata backups" possibly added by flushers
-    auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
-    for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++)
-        if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta,
-            mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL))
-            return true;
-    return false;
+    if (!dyn_data)
+    {
+        assert(inmemory_meta);
+        dyn_data = get_clean_entry_bitmap(clean_loc, 0);
+    }
+    return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
 }
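For reference, the general shape of the block-granular verification that
verify_padded_checksums() performs, as a sketch only: one stored checksum per
csum_block_size bytes of a read range assumed to be already padded to whole
checksum blocks, with mismatches reported through a callback of the same
std::function<void(uint32_t, uint32_t, uint32_t)> shape as above. A toy
rotate-XOR stands in for the real checksum function.

    // Sketch: verifies one stored checksum per csum_block_size bytes of an
    // iovec list, reporting each mismatching block through bad_block_cb.
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <sys/uio.h>

    bool verify_blocks(const uint32_t *stored, uint32_t csum_block_size, iovec *iov, int n_iov,
        std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
    {
        bool ok = true;
        uint32_t block = 0, done = 0, cur = 0;
        for (int i = 0; i < n_iov; i++)
        {
            uint8_t *buf = (uint8_t*)iov[i].iov_base;
            for (size_t j = 0; j < iov[i].iov_len; j++)
            {
                cur = (cur << 1 | cur >> 31) ^ buf[j]; // toy checksum accumulator
                if (++done == csum_block_size)
                {
                    if (cur != stored[block])
                    {
                        ok = false;
                        bad_block_cb(block*csum_block_size, cur, stored[block]);
                    }
                    block++;
                    done = cur = 0;
                }
            }
        }
        return ok;
    }

    int main()
    {
        uint8_t data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        uint32_t stored[2] = { 0, 0 }; // deliberately wrong checksums
        iovec iov = { data, sizeof(data) };
        verify_blocks(stored, 4, &iov, 1, [](uint32_t at, uint32_t calc, uint32_t exp)
        {
            printf("mismatch at 0x%x: got %08x, expected %08x\n", at, calc, exp);
        });
        return 0;
    }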
"journal" : "data", - rv[i].disk_offset, read_len - ); op->retval = -EDOM; } free(rv[i].buf); @@ -992,11 +985,6 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op { data_alloc->set(uo_it->second.freed_block-1, false); } - if (uo_it->second.meta) - { - free(uo_it->second.meta); - uo_it->second.meta = NULL; - } used_clean_objects.erase(uo_it++); } }