Track used blocks, not object versions

hotfix-1.0.0
Vitaliy Filippov 2023-07-15 12:12:55 +03:00
parent 21b5124a4b
commit 1a4ceb420d
4 changed files with 31 additions and 67 deletions

View File

@ -160,17 +160,13 @@ void journal_flusher_t::remove_flush(object_id oid)
} }
} }
bool journal_flusher_t::is_flushed_over(obj_ver_id ov) bool journal_flusher_t::is_mutated(uint64_t clean_loc)
{ {
auto r_it = sync_to_repeat.find(ov.oid); for (int i = 0; i < cur_flusher_count; i++)
if (r_it != sync_to_repeat.end())
{ {
for (int i = 0; i < cur_flusher_count; i++) if (co[i].clean_loc == clean_loc && co[i].copy_count > 0)
{ {
if (co[i].wait_state != 0 && co[i].cur.oid == ov.oid) return true;
{
return co[i].cur.version > ov.version;
}
} }
} }
return false; return false;
@ -499,7 +495,7 @@ resume_2:
if (bs->dsk.csum_block_size) if (bs->dsk.csum_block_size)
{ {
// Mark objects used by reads as modified // Mark objects used by reads as modified
auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = clean_ver }); auto uo_it = bs->used_clean_objects.find(clean_loc);
if (uo_it != bs->used_clean_objects.end()) if (uo_it != bs->used_clean_objects.end())
{ {
uo_it->second.was_changed = true; uo_it->second.was_changed = true;
@ -1248,7 +1244,7 @@ void journal_flusher_co::free_data_blocks()
{ {
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc) if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{ {
auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = old_clean_ver }); auto uo_it = bs->used_clean_objects.find(old_clean_loc);
bool used = uo_it != bs->used_clean_objects.end(); bool used = uo_it != bs->used_clean_objects.end();
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n", printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n",
@ -1258,25 +1254,25 @@ void journal_flusher_co::free_data_blocks()
clean_loc >> bs->dsk.block_order); clean_loc >> bs->dsk.block_order);
#endif #endif
if (used) if (used)
uo_it->second.freed_block = 1 + (old_clean_loc >> bs->dsk.block_order); uo_it->second.was_freed = true;
else else
bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false); bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false);
} }
if (has_delete) if (has_delete)
{ {
assert(clean_loc == old_clean_loc); assert(clean_loc == old_clean_loc);
auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = old_clean_ver }); auto uo_it = bs->used_clean_objects.find(old_clean_loc);
bool used = uo_it != bs->used_clean_objects.end(); bool used = uo_it != bs->used_clean_objects.end();
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("%s block %lu from %lx:%lx v%lu (delete)\n", printf("%s block %lu from %lx:%lx v%lu (delete)\n",
used ? "Postpone free" : "Free", used ? "Postpone free" : "Free",
clean_loc >> bs->dsk.block_order, old_clean_loc >> bs->dsk.block_order,
cur.oid.inode, cur.oid.stripe, cur.version); cur.oid.inode, cur.oid.stripe, cur.version);
#endif #endif
if (used) if (used)
uo_it->second.freed_block = 1 + (clean_loc >> bs->dsk.block_order); uo_it->second.was_freed = true;
else else
bs->data_alloc->set(clean_loc >> bs->dsk.block_order, false); bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false);
} }
} }

View File

@ -136,5 +136,5 @@ public:
void unshift_flush(obj_ver_id oid, bool force); void unshift_flush(obj_ver_id oid, bool force);
void remove_flush(object_id oid); void remove_flush(object_id oid);
void dump_diagnostics(); void dump_diagnostics();
bool is_flushed_over(obj_ver_id ov); bool is_mutated(uint64_t clean_loc);
}; };

View File

@ -180,7 +180,7 @@ struct __attribute__((__packed__)) dirty_entry
struct used_clean_obj_t struct used_clean_obj_t
{ {
int refs; int refs;
uint64_t freed_block; // block+1 if freed, otherwise 0 bool was_freed; // was freed by a parallel flush?
bool was_changed; // was changed by a parallel flush? bool was_changed; // was changed by a parallel flush?
}; };
@ -206,7 +206,7 @@ struct blockstore_op_private_t
int op_state; int op_state;
// Read // Read
uint64_t clean_version_used; uint64_t clean_block_used;
std::vector<copy_buffer_t> read_vec; std::vector<copy_buffer_t> read_vec;
// Sync, write // Sync, write
@ -286,7 +286,7 @@ class blockstore_impl_t
bool alloc_dyn_data = false; bool alloc_dyn_data = false;
// clean data blocks referenced by read operations // clean data blocks referenced by read operations
std::map<obj_ver_id, used_clean_obj_t> used_clean_objects; std::map<uint64_t, used_clean_obj_t> used_clean_objects;
bool live = false, queue_stall = false; bool live = false, queue_stall = false;
ring_loop_t *ringloop; ring_loop_t *ringloop;

View File

@ -131,7 +131,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
// If we don't track it then we may IN THEORY read another object's data: // If we don't track it then we may IN THEORY read another object's data:
// submit read -> remove the object -> flush remove -> overwrite with another object -> finish read // submit read -> remove the object -> flush remove -> overwrite with another object -> finish read
// Very improbable, but possible // Very improbable, but possible
PRIV(read_op)->clean_version_used = 1; PRIV(read_op)->clean_block_used = 1;
} }
rv.insert(rv.begin() + pos, el); rv.insert(rv.begin() + pos, el);
fulfilled += el.len; fulfilled += el.len;
@ -371,14 +371,9 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin
} }
if (!(vi->copy_flags & COPY_BUF_JOURNAL)) if (!(vi->copy_flags & COPY_BUF_JOURNAL))
{ {
// Reading may race with flushing. // Reads running parallel to flushes of the same clean block may read
// - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta // a mixture of old and new data. So we don't verify checksums for such blocks.
// - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7 PRIV(op)->clean_block_used = 1;
// - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs
// - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata
// otherwise the checksum may not match
// So flushers save a copy of punched metadata if the object is being read during (6).
PRIV(op)->clean_version_used = 1;
} }
return true; return true;
} }
@ -404,7 +399,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
} }
uint64_t fulfilled = 0; uint64_t fulfilled = 0;
PRIV(read_op)->pending_ops = 0; PRIV(read_op)->pending_ops = 0;
PRIV(read_op)->clean_version_used = 0; PRIV(read_op)->clean_block_used = 0;
auto & rv = PRIV(read_op)->read_vec; auto & rv = PRIV(read_op)->read_vec;
uint64_t result_version = 0; uint64_t result_version = 0;
if (dirty_found) if (dirty_found)
@ -617,7 +612,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
return false; return false;
} }
} }
PRIV(read_op)->clean_version_used = req > 0; PRIV(read_op)->clean_block_used = req > 0;
} }
else if (from_journal) else if (from_journal)
{ {
@ -682,14 +677,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
} }
} }
// Increment reference counter if clean data is being read from the disk // Increment reference counter if clean data is being read from the disk
if (PRIV(read_op)->clean_version_used) if (PRIV(read_op)->clean_block_used)
{ {
obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver }; auto & uo = used_clean_objects[clean_loc];
auto & uo = used_clean_objects[ov];
uo.refs++; uo.refs++;
if (dsk.csum_block_size && flusher->is_flushed_over(ov)) if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
uo.was_changed = true; uo.was_changed = true;
PRIV(read_op)->clean_version_used = ov.version; PRIV(read_op)->clean_block_used = clean_loc;
} }
return true; return true;
} }
@ -872,7 +866,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
{ {
// BIG_WRITE from journal or clean data // BIG_WRITE from journal or clean data
// Do not verify checksums if the data location is/was mutated by flushers // Do not verify checksums if the data location is/was mutated by flushers
auto & uo = used_clean_objects.at((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used }); auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
if (!uo.was_changed) if (!uo.was_changed)
{ {
verify_clean_padded_checksums( verify_clean_padded_checksums(
@ -948,46 +942,20 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
meta_block = NULL; meta_block = NULL;
} }
} }
if (PRIV(op)->clean_version_used) if (PRIV(op)->clean_block_used)
{ {
// Release clean data block // Release clean data block
obj_ver_id ov = { .oid = op->oid, .version = PRIV(op)->clean_version_used }; auto uo_it = used_clean_objects.find(PRIV(op)->clean_block_used);
auto uo_it = used_clean_objects.find(ov);
if (uo_it != used_clean_objects.end()) if (uo_it != used_clean_objects.end())
{ {
uo_it->second.refs--; uo_it->second.refs--;
if (uo_it->second.refs <= 0) if (uo_it->second.refs <= 0)
{ {
// Check to the left - even older usage entries may exist if (uo_it->second.was_freed)
bool still_used = false;
while (uo_it != used_clean_objects.begin())
{ {
uo_it--; data_alloc->set(PRIV(op)->clean_block_used, false);
if (uo_it->first.oid != op->oid)
{
uo_it++;
break;
}
if (uo_it->second.refs > 0)
{
still_used = true;
break;
}
}
// Free uo_it AND all following records with refs==0 too
if (!still_used)
{
while (uo_it != used_clean_objects.end() &&
uo_it->first.oid == op->oid &&
uo_it->second.refs == 0)
{
if (uo_it->second.freed_block > 0)
{
data_alloc->set(uo_it->second.freed_block-1, false);
}
used_clean_objects.erase(uo_it++);
}
} }
used_clean_objects.erase(uo_it);
} }
} }
} }