From 1a4ceb420df6050e0e231f7c3e7688c2d367ddfc Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 15 Jul 2023 12:12:55 +0300 Subject: [PATCH] Track used blocks, not object versions --- src/blockstore_flush.cpp | 26 +++++++--------- src/blockstore_flush.h | 2 +- src/blockstore_impl.h | 6 ++-- src/blockstore_read.cpp | 64 ++++++++++------------------------------ 4 files changed, 31 insertions(+), 67 deletions(-) diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index c939a562..57125eff 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -160,17 +160,13 @@ void journal_flusher_t::remove_flush(object_id oid) } } -bool journal_flusher_t::is_flushed_over(obj_ver_id ov) +bool journal_flusher_t::is_mutated(uint64_t clean_loc) { - auto r_it = sync_to_repeat.find(ov.oid); - if (r_it != sync_to_repeat.end()) + for (int i = 0; i < cur_flusher_count; i++) { - for (int i = 0; i < cur_flusher_count; i++) + if (co[i].clean_loc == clean_loc && co[i].copy_count > 0) { - if (co[i].wait_state != 0 && co[i].cur.oid == ov.oid) - { - return co[i].cur.version > ov.version; - } + return true; } } return false; @@ -499,7 +495,7 @@ resume_2: if (bs->dsk.csum_block_size) { // Mark objects used by reads as modified - auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = clean_ver }); + auto uo_it = bs->used_clean_objects.find(clean_loc); if (uo_it != bs->used_clean_objects.end()) { uo_it->second.was_changed = true; @@ -1248,7 +1244,7 @@ void journal_flusher_co::free_data_blocks() { if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc) { - auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = old_clean_ver }); + auto uo_it = bs->used_clean_objects.find(old_clean_loc); bool used = uo_it != bs->used_clean_objects.end(); #ifdef BLOCKSTORE_DEBUG printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n", @@ -1258,25 +1254,25 @@ void journal_flusher_co::free_data_blocks() clean_loc >> bs->dsk.block_order); #endif if (used) - uo_it->second.freed_block = 1 + (old_clean_loc >> bs->dsk.block_order); + uo_it->second.was_freed = true; else bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false); } if (has_delete) { assert(clean_loc == old_clean_loc); - auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = old_clean_ver }); + auto uo_it = bs->used_clean_objects.find(old_clean_loc); bool used = uo_it != bs->used_clean_objects.end(); #ifdef BLOCKSTORE_DEBUG printf("%s block %lu from %lx:%lx v%lu (delete)\n", used ? "Postpone free" : "Free", - clean_loc >> bs->dsk.block_order, + old_clean_loc >> bs->dsk.block_order, cur.oid.inode, cur.oid.stripe, cur.version); #endif if (used) - uo_it->second.freed_block = 1 + (clean_loc >> bs->dsk.block_order); + uo_it->second.was_freed = true; else - bs->data_alloc->set(clean_loc >> bs->dsk.block_order, false); + bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false); } } diff --git a/src/blockstore_flush.h b/src/blockstore_flush.h index 5cbfda4a..819c3326 100644 --- a/src/blockstore_flush.h +++ b/src/blockstore_flush.h @@ -136,5 +136,5 @@ public: void unshift_flush(obj_ver_id oid, bool force); void remove_flush(object_id oid); void dump_diagnostics(); - bool is_flushed_over(obj_ver_id ov); + bool is_mutated(uint64_t clean_loc); }; diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index db54bfdb..05b212d6 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -180,7 +180,7 @@ struct __attribute__((__packed__)) dirty_entry struct used_clean_obj_t { int refs; - uint64_t freed_block; // block+1 if freed, otherwise 0 + bool was_freed; // was freed by a parallel flush? bool was_changed; // was changed by a parallel flush? }; @@ -206,7 +206,7 @@ struct blockstore_op_private_t int op_state; // Read - uint64_t clean_version_used; + uint64_t clean_block_used; std::vector read_vec; // Sync, write @@ -286,7 +286,7 @@ class blockstore_impl_t bool alloc_dyn_data = false; // clean data blocks referenced by read operations - std::map used_clean_objects; + std::map used_clean_objects; bool live = false, queue_stall = false; ring_loop_t *ringloop; diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index cf57dd1b..73c11465 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -131,7 +131,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, // If we don't track it then we may IN THEORY read another object's data: // submit read -> remove the object -> flush remove -> overwrite with another object -> finish read // Very improbable, but possible - PRIV(read_op)->clean_version_used = 1; + PRIV(read_op)->clean_block_used = 1; } rv.insert(rv.begin() + pos, el); fulfilled += el.len; @@ -371,14 +371,9 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin } if (!(vi->copy_flags & COPY_BUF_JOURNAL)) { - // Reading may race with flushing. - // - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta - // - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7 - // - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs - // - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata - // otherwise the checksum may not match - // So flushers save a copy of punched metadata if the object is being read during (6). - PRIV(op)->clean_version_used = 1; + // Reads running parallel to flushes of the same clean block may read + // a mixture of old and new data. So we don't verify checksums for such blocks. + PRIV(op)->clean_block_used = 1; } return true; } @@ -404,7 +399,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) } uint64_t fulfilled = 0; PRIV(read_op)->pending_ops = 0; - PRIV(read_op)->clean_version_used = 0; + PRIV(read_op)->clean_block_used = 0; auto & rv = PRIV(read_op)->read_vec; uint64_t result_version = 0; if (dirty_found) @@ -617,7 +612,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & return false; } } - PRIV(read_op)->clean_version_used = req > 0; + PRIV(read_op)->clean_block_used = req > 0; } else if (from_journal) { @@ -682,14 +677,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & } } // Increment reference counter if clean data is being read from the disk - if (PRIV(read_op)->clean_version_used) + if (PRIV(read_op)->clean_block_used) { - obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver }; - auto & uo = used_clean_objects[ov]; + auto & uo = used_clean_objects[clean_loc]; uo.refs++; - if (dsk.csum_block_size && flusher->is_flushed_over(ov)) + if (dsk.csum_block_size && flusher->is_mutated(clean_loc)) uo.was_changed = true; - PRIV(read_op)->clean_version_used = ov.version; + PRIV(read_op)->clean_block_used = clean_loc; } return true; } @@ -872,7 +866,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op { // BIG_WRITE from journal or clean data // Do not verify checksums if the data location is/was mutated by flushers - auto & uo = used_clean_objects.at((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used }); + auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order); if (!uo.was_changed) { verify_clean_padded_checksums( @@ -948,46 +942,20 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op meta_block = NULL; } } - if (PRIV(op)->clean_version_used) + if (PRIV(op)->clean_block_used) { // Release clean data block - obj_ver_id ov = { .oid = op->oid, .version = PRIV(op)->clean_version_used }; - auto uo_it = used_clean_objects.find(ov); + auto uo_it = used_clean_objects.find(PRIV(op)->clean_block_used); if (uo_it != used_clean_objects.end()) { uo_it->second.refs--; if (uo_it->second.refs <= 0) { - // Check to the left - even older usage entries may exist - bool still_used = false; - while (uo_it != used_clean_objects.begin()) + if (uo_it->second.was_freed) { - uo_it--; - if (uo_it->first.oid != op->oid) - { - uo_it++; - break; - } - if (uo_it->second.refs > 0) - { - still_used = true; - break; - } - } - // Free uo_it AND all following records with refs==0 too - if (!still_used) - { - while (uo_it != used_clean_objects.end() && - uo_it->first.oid == op->oid && - uo_it->second.refs == 0) - { - if (uo_it->second.freed_block > 0) - { - data_alloc->set(uo_it->second.freed_block-1, false); - } - used_clean_objects.erase(uo_it++); - } + data_alloc->set(PRIV(op)->clean_block_used, false); } + used_clean_objects.erase(uo_it); } } }