Remove creepy "metadata copying" during overwrite

Instead, just skip checksum verification for objects that are currently being mutated.
When a flush modifies clean data in parallel with a read request, that read may
return a mix of old and new data. It may even return a mix of multiple flushed
versions if it takes too long. Attempts to verify such reads using temporary
copies of metadata made the algorithm too complex and creepy.
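
In short: a read pins the clean object version it uses, a flusher that overwrites
that object marks the pinned version as changed instead of snapshotting the old
metadata entry, and checksum verification is simply skipped for marked versions.
A minimal sketch of the scheme (the helpers pin_for_read/mark_changed/should_verify
are hypothetical illustration only; the real code uses used_clean_objects,
was_changed and is_flushed_over, see the diff below):

// Sketch only: illustrates the idea, not the actual blockstore code.
#include <cstdint>
#include <map>

struct obj_ver_id
{
    uint64_t oid = 0, version = 0;
    bool operator<(const obj_ver_id & other) const
    {
        return oid < other.oid || (oid == other.oid && version < other.version);
    }
};

struct used_clean_obj_t
{
    int refs = 0;
    bool was_changed = false; // was changed by a parallel flush?
};

static std::map<obj_ver_id, used_clean_obj_t> used_clean_objects;

// Read path: pin the clean object version used by an in-flight read
static void pin_for_read(obj_ver_id ov)
{
    used_clean_objects[ov].refs++;
}

// Flush path: instead of copying the old metadata entry, just mark
// the pinned version as mutated
static void mark_changed(obj_ver_id ov)
{
    auto uo_it = used_clean_objects.find(ov);
    if (uo_it != used_clean_objects.end())
        uo_it->second.was_changed = true;
}

// Read completion: verify checksums only if no flush touched the object
// while the read was in flight
static bool should_verify(obj_ver_id ov)
{
    auto uo_it = used_clean_objects.find(ov);
    return uo_it == used_clean_objects.end() || !uo_it->second.was_changed;
}

The tradeoff is that a read racing with a flush of the same object completes
without checksum verification, but such a read could not be verified meaningfully
anyway: it may contain a mix of several versions.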
hotfix-1.0.0
Vitaliy Filippov 2023-07-14 00:02:59 +03:00
parent a8464c19af
commit 4181add1f4
4 changed files with 83 additions and 89 deletions

src/blockstore_flush.cpp

@@ -160,6 +160,22 @@ void journal_flusher_t::remove_flush(object_id oid)
     }
 }
 
+bool journal_flusher_t::is_flushed_over(obj_ver_id ov)
+{
+    auto r_it = sync_to_repeat.find(ov.oid);
+    if (r_it != sync_to_repeat.end())
+    {
+        for (int i = 0; i < cur_flusher_count; i++)
+        {
+            if (co[i].wait_state != 0 && co[i].cur.oid == ov.oid)
+            {
+                return co[i].cur.version > ov.version;
+            }
+        }
+    }
+    return false;
+}
+
 void journal_flusher_t::request_trim()
 {
     dequeuing = true;
@@ -444,6 +460,7 @@ stop_flusher:
         else
         {
             clean_loc = old_clean_loc;
+            clean_ver = old_clean_ver;
         }
     }
     // Submit dirty data and old checksum data reads
@@ -479,6 +496,15 @@ resume_2:
         wait_state = wait_base+13;
         return false;
     }
+    if (bs->dsk.csum_block_size)
+    {
+        // Mark objects used by reads as modified
+        auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = clean_ver });
+        if (uo_it != bs->used_clean_objects.end())
+        {
+            uo_it->second.was_changed = true;
+        }
+    }
     // Submit data writes
     for (it = v.begin(); it != v.end(); it++)
     {
@@ -601,34 +627,6 @@ void journal_flusher_co::update_metadata_entry()
     }
     else
     {
-        if (bs->dsk.csum_block_size)
-        {
-            // Copy the whole old metadata entry before updating it if the updated object is being read
-            obj_ver_id ov = { .oid = new_entry->oid, .version = new_entry->version };
-            auto uo_it = bs->used_clean_objects.upper_bound(ov);
-            if (uo_it != bs->used_clean_objects.begin())
-            {
-                uo_it--;
-                if (uo_it->first.oid == new_entry->oid)
-                {
-                    uint8_t *meta_copy = (uint8_t*)malloc_or_die(bs->dsk.clean_entry_size);
-                    memcpy(meta_copy, new_entry, bs->dsk.clean_entry_size);
-                    // The reads should free all metadata entry backups when they don't need them anymore
-                    if (uo_it->first.version < new_entry->version)
-                    {
-                        // If ==, write in place
-                        uo_it++;
-                        bs->used_clean_objects.insert(uo_it, std::make_pair(ov, (used_clean_obj_t){
-                            .meta = meta_copy,
-                        }));
-                    }
-                    else
-                    {
-                        uo_it->second.meta = meta_copy;
-                    }
-                }
-            }
-        }
         // Set initial internal bitmap bits from the big write
         if (clean_init_bitmap)
         {
@@ -725,6 +723,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int
     return true;
 }
 
+// Punch holes in incomplete checksum blocks
 bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
 {
     if (wait_state == wait_base) goto resume_0;
@@ -805,12 +804,6 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
             });
         }
     }
-    // Actually clear bits
-    for (auto it = v.begin(); it != v.end(); it++)
-    {
-        if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
-            bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
-    }
     {
         clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
         if (new_entry->oid != cur.oid)
@@ -822,11 +815,20 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
            );
        }
        assert(new_entry->oid == cur.oid);
+        // Actually clear bits
+        for (auto it = v.begin(); it != v.end(); it++)
+        {
+            if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
+                bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
+        }
        // Calculate block checksums with new holes
        uint32_t *new_data_csums = (uint32_t*)(new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size);
        calc_block_checksums(new_data_csums, true);
        if (!bs->inmemory_meta)
-            memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size);
+        {
+            auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size;
+            memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size);
+        }
        if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
        {
            // calculate metadata entry checksum
@@ -914,6 +916,7 @@ void journal_flusher_co::scan_dirty()
     v.clear();
     copy_count = 0;
     clean_loc = UINT64_MAX;
+    clean_ver = 0;
     has_delete = false;
     has_writes = false;
     skip_copy = false;
@@ -989,6 +992,7 @@ void journal_flusher_co::scan_dirty()
             // There is an unflushed big write. Copy small writes in its position
             has_writes = true;
             clean_loc = dirty_it->second.location;
+            clean_ver = dirty_it->first.version;
             clean_init_bitmap = true;
             clean_bitmap_offset = dirty_it->second.offset;
             clean_bitmap_len = dirty_it->second.len;

src/blockstore_flush.h

@@ -67,7 +67,7 @@ class journal_flusher_co
     bool fill_incomplete, cleared_incomplete;
     int read_to_fill_incomplete;
     int copy_count;
-    uint64_t clean_loc, old_clean_loc, old_clean_ver;
+    uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
     flusher_meta_write_t meta_old, meta_new;
     bool clean_init_bitmap;
     uint64_t clean_bitmap_offset, clean_bitmap_len;
@@ -136,4 +136,5 @@ public:
     void unshift_flush(obj_ver_id oid, bool force);
     void remove_flush(object_id oid);
     void dump_diagnostics();
+    bool is_flushed_over(obj_ver_id ov);
};

src/blockstore_impl.h

@@ -181,7 +181,7 @@ struct used_clean_obj_t
 {
     int refs;
     uint64_t freed_block; // block+1 if freed, otherwise 0
-    uint8_t *meta; // metadata copy
+    bool was_changed; // was changed by a parallel flush?
 };
 
 // https://github.com/algorithm-ninja/cpp-btree
@@ -352,7 +352,8 @@ class blockstore_impl_t
         iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
     bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
         iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
-    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov);
+    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
     int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
         uint32_t item_state, uint64_t item_version);
     void handle_read_event(ring_data_t *data, blockstore_op_t *op);

src/blockstore_read.cpp

@@ -643,7 +643,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
     else
     {
         bool csum_done = !dsk.csum_block_size || inmemory_meta;
-        uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size;
+        uint8_t *csum_buf = clean_entry_bitmap;
         uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
         while (bmp_start < bmp_size)
         {
@@ -670,7 +670,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
                csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
                csum_done = true;
            }
-            uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF));
+            uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
            if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
                bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
                clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
@@ -685,7 +685,10 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
     if (PRIV(read_op)->clean_version_used)
     {
         obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver };
-        used_clean_objects[ov].refs++;
+        auto & uo = used_clean_objects[ov];
+        uo.refs++;
+        if (dsk.csum_block_size && flusher->is_flushed_over(ov))
+            uo.was_changed = true;
         PRIV(read_op)->clean_version_used = ov.version;
     }
     return true;
@@ -707,8 +710,8 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
     PRIV(op)->pending_ops++;
     my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
     data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
-    // return pointer to checksums
-    return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
+    // return pointer to checksums + bitmap
+    return buf + pos + sizeof(clean_disk_entry);
 }
 
 bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
@@ -803,21 +806,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset
     return true;
 }
 
-bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov)
+bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+    iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
 {
     uint32_t offset = clean_loc % dsk.data_block_size;
+    if (from_journal)
+        return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
     clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
-    // First verify against the newest checksum version
-    uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
-    if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL))
-        return true;
-    // Check through all relevant "metadata backups" possibly added by flushers
-    auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
-    for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++)
-        if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta,
-            mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL))
-            return true;
-    return false;
+    if (!dyn_data)
+    {
+        assert(inmemory_meta);
+        dyn_data = get_clean_entry_bitmap(clean_loc, 0);
+    }
+    return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
 }
 
 void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
@@ -860,45 +861,37 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                        {
                            ok = false;
                            printf(
-                                "Checksum mismatch in object %lx:%lx v%lu in journal at block #%u: got %08x, expected %08x\n",
+                                "Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
                                op->oid.inode, op->oid.stripe, op->version,
-                                bad_block / dsk.csum_block_size, calc_csum, stored_csum
-                            );
-                        }
-                    );
-                }
-                else if (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG)
-                {
-                    // BIG_WRITE from journal
-                    verify_padded_checksums(rv[i].csum_buf, rv[i].csum_buf + dsk.clean_entry_bitmap_size,
-                        rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
-                        [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
-                        {
-                            ok = false;
-                            printf(
-                                "Checksum mismatch in object %lx:%lx v%lu in RoW data at block #%u: got %08x, expected %08x\n",
-                                op->oid.inode, op->oid.stripe, op->version,
-                                bad_block / dsk.csum_block_size, calc_csum, stored_csum
+                                rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
                            );
                        }
                    );
                }
                else
                {
-                    // Clean data
-                    ok = verify_clean_padded_checksums(op, rv[i].disk_offset, rv[i].csum_buf, iov, n_iov);
+                    // BIG_WRITE from journal or clean data
+                    // Do not verify checksums if the data location is/was mutated by flushers
+                    auto & uo = used_clean_objects.at((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
+                    if (!uo.was_changed)
+                    {
+                        verify_clean_padded_checksums(
+                            op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
+                            [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
+                            {
+                                ok = false;
+                                printf(
+                                    "Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
+                                    op->oid.inode, op->oid.stripe, op->version,
+                                    (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
+                                    rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
+                                );
+                            }
+                        );
+                    }
                }
                if (!ok)
                {
-                    uint64_t read_len = 0;
-                    for (int i = 0; i < n_iov; i++)
-                        read_len += iov[i].iov_len;
-                    printf(
-                        "Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx (read length 0x%lx)\n",
-                        op->oid.inode, op->oid.stripe, op->version,
-                        (rv[i].copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data",
-                        rv[i].disk_offset, read_len
-                    );
                    op->retval = -EDOM;
                }
                free(rv[i].buf);
@@ -992,11 +985,6 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
            {
                data_alloc->set(uo_it->second.freed_block-1, false);
            }
-            if (uo_it->second.meta)
-            {
-                free(uo_it->second.meta);
-                uo_it->second.meta = NULL;
-            }
            used_clean_objects.erase(uo_it++);
        }
    }
}