Remove creepy "metadata copying" during overwrite
Instead, just do not verify checksums of currently mutated objects. When clean data modification during flush runs in parallel to a read request, that request may read a mix of old and new data. It may even read a mix of multiple flushed versions if it lasts too long... And attempts to verify it using temporary copies of metadata make the algorithm too complex and creepy.
cached-reads
parent
8e099c1d11
commit
708918a4c7
|
@ -160,6 +160,22 @@ void journal_flusher_t::remove_flush(object_id oid)
|
|||
}
|
||||
}
|
||||
|
||||
// Report whether the given object version has been overtaken by an active flush.
// Returns true only when the object currently has an in-flight sync/repeat entry
// AND some busy flusher coroutine is processing the same object at a strictly
// newer version; in every other case the version is not considered flushed-over.
bool journal_flusher_t::is_flushed_over(obj_ver_id ov)
{
    // No pending flush activity recorded for this object => nothing overtook it
    if (sync_to_repeat.find(ov.oid) == sync_to_repeat.end())
        return false;
    // Scan active flusher coroutines for one working on this object
    for (int idx = 0; idx < cur_flusher_count; idx++)
    {
        auto & flusher = co[idx];
        if (flusher.wait_state != 0 && flusher.cur.oid == ov.oid)
            return flusher.cur.version > ov.version;
    }
    return false;
}
|
||||
|
||||
void journal_flusher_t::request_trim()
|
||||
{
|
||||
dequeuing = true;
|
||||
|
@ -444,6 +460,7 @@ stop_flusher:
|
|||
else
|
||||
{
|
||||
clean_loc = old_clean_loc;
|
||||
clean_ver = old_clean_ver;
|
||||
}
|
||||
}
|
||||
// Submit dirty data and old checksum data reads
|
||||
|
@ -479,6 +496,15 @@ resume_2:
|
|||
wait_state = wait_base+13;
|
||||
return false;
|
||||
}
|
||||
if (bs->dsk.csum_block_size)
|
||||
{
|
||||
// Mark objects used by reads as modified
|
||||
auto uo_it = bs->used_clean_objects.find((obj_ver_id){ .oid = cur.oid, .version = clean_ver });
|
||||
if (uo_it != bs->used_clean_objects.end())
|
||||
{
|
||||
uo_it->second.was_changed = true;
|
||||
}
|
||||
}
|
||||
// Submit data writes
|
||||
for (it = v.begin(); it != v.end(); it++)
|
||||
{
|
||||
|
@ -601,34 +627,6 @@ void journal_flusher_co::update_metadata_entry()
|
|||
}
|
||||
else
|
||||
{
|
||||
if (bs->dsk.csum_block_size)
|
||||
{
|
||||
// Copy the whole old metadata entry before updating it if the updated object is being read
|
||||
obj_ver_id ov = { .oid = new_entry->oid, .version = new_entry->version };
|
||||
auto uo_it = bs->used_clean_objects.upper_bound(ov);
|
||||
if (uo_it != bs->used_clean_objects.begin())
|
||||
{
|
||||
uo_it--;
|
||||
if (uo_it->first.oid == new_entry->oid)
|
||||
{
|
||||
uint8_t *meta_copy = (uint8_t*)malloc_or_die(bs->dsk.clean_entry_size);
|
||||
memcpy(meta_copy, new_entry, bs->dsk.clean_entry_size);
|
||||
// The reads should free all metadata entry backups when they don't need them anymore
|
||||
if (uo_it->first.version < new_entry->version)
|
||||
{
|
||||
// If ==, write in place
|
||||
uo_it++;
|
||||
bs->used_clean_objects.insert(uo_it, std::make_pair(ov, (used_clean_obj_t){
|
||||
.meta = meta_copy,
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
uo_it->second.meta = meta_copy;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set initial internal bitmap bits from the big write
|
||||
if (clean_init_bitmap)
|
||||
{
|
||||
|
@ -725,6 +723,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int
|
|||
return true;
|
||||
}
|
||||
|
||||
// Punch holes in incomplete checksum blocks
|
||||
bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||
{
|
||||
if (wait_state == wait_base) goto resume_0;
|
||||
|
@ -805,12 +804,6 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
|||
});
|
||||
}
|
||||
}
|
||||
// Actually clear bits
|
||||
for (auto it = v.begin(); it != v.end(); it++)
|
||||
{
|
||||
if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
|
||||
bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
|
||||
}
|
||||
{
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
|
||||
if (new_entry->oid != cur.oid)
|
||||
|
@ -822,11 +815,20 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
|||
);
|
||||
}
|
||||
assert(new_entry->oid == cur.oid);
|
||||
// Actually clear bits
|
||||
for (auto it = v.begin(); it != v.end(); it++)
|
||||
{
|
||||
if (it->copy_flags == COPY_BUF_JOURNAL || it->copy_flags == (COPY_BUF_JOURNAL|COPY_BUF_COALESCED))
|
||||
bitmap_clear(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
|
||||
}
|
||||
// Calculate block checksums with new holes
|
||||
uint32_t *new_data_csums = (uint32_t*)(new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size);
|
||||
calc_block_checksums(new_data_csums, true);
|
||||
if (!bs->inmemory_meta)
|
||||
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size);
|
||||
{
|
||||
auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size;
|
||||
memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size);
|
||||
}
|
||||
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
// calculate metadata entry checksum
|
||||
|
@ -914,6 +916,7 @@ void journal_flusher_co::scan_dirty()
|
|||
v.clear();
|
||||
copy_count = 0;
|
||||
clean_loc = UINT64_MAX;
|
||||
clean_ver = 0;
|
||||
has_delete = false;
|
||||
has_writes = false;
|
||||
skip_copy = false;
|
||||
|
@ -989,6 +992,7 @@ void journal_flusher_co::scan_dirty()
|
|||
// There is an unflushed big write. Copy small writes in its position
|
||||
has_writes = true;
|
||||
clean_loc = dirty_it->second.location;
|
||||
clean_ver = dirty_it->first.version;
|
||||
clean_init_bitmap = true;
|
||||
clean_bitmap_offset = dirty_it->second.offset;
|
||||
clean_bitmap_len = dirty_it->second.len;
|
||||
|
|
|
@ -67,7 +67,7 @@ class journal_flusher_co
|
|||
bool fill_incomplete, cleared_incomplete;
|
||||
int read_to_fill_incomplete;
|
||||
int copy_count;
|
||||
uint64_t clean_loc, old_clean_loc, old_clean_ver;
|
||||
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
|
||||
flusher_meta_write_t meta_old, meta_new;
|
||||
bool clean_init_bitmap;
|
||||
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
||||
|
@ -136,4 +136,5 @@ public:
|
|||
void unshift_flush(obj_ver_id oid, bool force);
|
||||
void remove_flush(object_id oid);
|
||||
void dump_diagnostics();
|
||||
bool is_flushed_over(obj_ver_id ov);
|
||||
};
|
||||
|
|
|
@ -181,7 +181,7 @@ struct used_clean_obj_t
|
|||
{
|
||||
int refs;
|
||||
uint64_t freed_block; // block+1 if freed, otherwise 0
|
||||
uint8_t *meta; // metadata copy
|
||||
bool was_changed; // was changed by a parallel flush?
|
||||
};
|
||||
|
||||
// https://github.com/algorithm-ninja/cpp-btree
|
||||
|
@ -352,7 +352,8 @@ class blockstore_impl_t
|
|||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||
uint32_t item_state, uint64_t item_version);
|
||||
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
|
|
@ -643,7 +643,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
else
|
||||
{
|
||||
bool csum_done = !dsk.csum_block_size || inmemory_meta;
|
||||
uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size;
|
||||
uint8_t *csum_buf = clean_entry_bitmap;
|
||||
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
|
||||
while (bmp_start < bmp_size)
|
||||
{
|
||||
|
@ -670,7 +670,7 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
|
||||
csum_done = true;
|
||||
}
|
||||
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF));
|
||||
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
|
||||
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
|
||||
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
|
||||
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
|
||||
|
@ -685,7 +685,10 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
if (PRIV(read_op)->clean_version_used)
|
||||
{
|
||||
obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver };
|
||||
used_clean_objects[ov].refs++;
|
||||
auto & uo = used_clean_objects[ov];
|
||||
uo.refs++;
|
||||
if (dsk.csum_block_size && flusher->is_flushed_over(ov))
|
||||
uo.was_changed = true;
|
||||
PRIV(read_op)->clean_version_used = ov.version;
|
||||
}
|
||||
return true;
|
||||
|
@ -707,8 +710,8 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
|
|||
PRIV(op)->pending_ops++;
|
||||
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
||||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||
// return pointer to checksums
|
||||
return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
|
||||
// return pointer to checksums + bitmap
|
||||
return buf + pos + sizeof(clean_disk_entry);
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
|
||||
|
@ -803,21 +806,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset
|
|||
return true;
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov)
|
||||
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
|
||||
{
|
||||
uint32_t offset = clean_loc % dsk.data_block_size;
|
||||
if (from_journal)
|
||||
return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
|
||||
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
|
||||
// First verify against the newest checksum version
|
||||
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
|
||||
if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL))
|
||||
return true;
|
||||
// Check through all relevant "metadata backups" possibly added by flushers
|
||||
auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
|
||||
for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++)
|
||||
if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta,
|
||||
mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL))
|
||||
return true;
|
||||
return false;
|
||||
if (!dyn_data)
|
||||
{
|
||||
assert(inmemory_meta);
|
||||
dyn_data = get_clean_entry_bitmap(clean_loc, 0);
|
||||
}
|
||||
return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
|
||||
|
@ -860,45 +861,37 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
{
|
||||
ok = false;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in journal at block #%u: got %08x, expected %08x\n",
|
||||
"Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
else if (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG)
|
||||
{
|
||||
// BIG_WRITE from journal
|
||||
verify_padded_checksums(rv[i].csum_buf, rv[i].csum_buf + dsk.clean_entry_bitmap_size,
|
||||
rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
|
||||
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
ok = false;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in RoW data at block #%u: got %08x, expected %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Clean data
|
||||
ok = verify_clean_padded_checksums(op, rv[i].disk_offset, rv[i].csum_buf, iov, n_iov);
|
||||
// BIG_WRITE from journal or clean data
|
||||
// Do not verify checksums if the data location is/was mutated by flushers
|
||||
auto & uo = used_clean_objects.at((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
|
||||
if (!uo.was_changed)
|
||||
{
|
||||
verify_clean_padded_checksums(
|
||||
op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
|
||||
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
ok = false;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
(rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
|
||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
if (!ok)
|
||||
{
|
||||
uint64_t read_len = 0;
|
||||
for (int i = 0; i < n_iov; i++)
|
||||
read_len += iov[i].iov_len;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx (read length 0x%lx)\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
(rv[i].copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data",
|
||||
rv[i].disk_offset, read_len
|
||||
);
|
||||
op->retval = -EDOM;
|
||||
}
|
||||
free(rv[i].buf);
|
||||
|
@ -992,11 +985,6 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
{
|
||||
data_alloc->set(uo_it->second.freed_block-1, false);
|
||||
}
|
||||
if (uo_it->second.meta)
|
||||
{
|
||||
free(uo_it->second.meta);
|
||||
uo_it->second.meta = NULL;
|
||||
}
|
||||
used_clean_objects.erase(uo_it++);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue