Support keeping checksums on disk (not in memory)

This is definitely beneficial for SSD+HDD setups.
hotfix-1.0.0
Vitaliy Filippov 2023-07-13 00:58:43 +03:00
parent 819cb70cdd
commit a8464c19af
8 changed files with 99 additions and 29 deletions

View File

@ -666,7 +666,10 @@ void journal_flusher_co::update_metadata_entry()
new_entry->oid = cur.oid;
new_entry->version = cur.version;
if (!bs->inmemory_meta)
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size);
{
auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size;
memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size);
}
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
// Calculate metadata entry checksum
@ -767,7 +770,8 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
{
assert(!(v[i].offset % bs->dsk.csum_block_size));
assert(!(v[i].len % bs->dsk.csum_block_size));
bs->verify_padded_checksums(new_clean_bitmap, false, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n",
cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
@ -1131,9 +1135,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
resume_0:
if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
return false;
new_clean_bitmap = (bs->inmemory_meta
? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_dyn_data + (clean_loc >> bs->dsk.block_order)*bs->dsk.clean_dyn_size);
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{
resume_1:

View File

@ -6,6 +6,8 @@
#define COPY_BUF_ZERO 4
#define COPY_BUF_CSUM_FILL 8
#define COPY_BUF_COALESCED 16
#define COPY_BUF_META_BLOCK 32
#define COPY_BUF_JOURNALED_BIG 64
struct copy_buffer_t
{

View File

@ -39,8 +39,8 @@ blockstore_impl_t::~blockstore_impl_t()
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_dyn_data)
free(clean_dyn_data);
if (clean_bitmaps)
free(clean_bitmaps);
}
bool blockstore_impl_t::is_started()

View File

@ -269,7 +269,7 @@ class blockstore_impl_t
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
uint8_t *clean_dyn_data = NULL;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
@ -347,11 +347,12 @@ class blockstore_impl_t
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset,
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);

View File

@ -334,7 +334,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
}
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
{
memcpy(bs->clean_dyn_data + (done_cnt+i)*bs->dsk.clean_dyn_size, &entry->bitmap, bs->dsk.clean_dyn_size);
memcpy(bs->clean_bitmaps + (done_cnt+i) * 2 * bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2 * bs->dsk.clean_entry_bitmap_size);
}
auto & clean_db = bs->clean_db_shard(entry->oid);
auto clean_it = clean_db.find(entry->oid);

View File

@ -139,13 +139,12 @@ void blockstore_impl_t::calc_lengths()
}
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
{
// FIXME: allow to store bitmap, but read checksums from the disk
clean_dyn_data = (uint8_t*)malloc(dsk.block_count * dsk.clean_dyn_size);
if (!clean_dyn_data)
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
if (!clean_bitmaps)
{
throw std::runtime_error(
"Failed to allocate memory for the metadata sparse write bitmap ("+
std::to_string(dsk.block_count * dsk.clean_dyn_size / 1024 / 1024)+" MB)"
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
);
}
}

View File

@ -159,7 +159,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_dyn_data + meta_loc*dsk.clean_dyn_size + offset);
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
return clean_entry_bitmap;
}
@ -196,7 +196,7 @@ int blockstore_impl_t::fill_partial_checksum_blocks(std::vector<copy_buffer_t> &
end_block++;
// OK, mark this range as required
rv.push_back((copy_buffer_t){
.copy_flags = COPY_BUF_CSUM_FILL,
.copy_flags = COPY_BUF_CSUM_FILL | (from_journal ? COPY_BUF_JOURNALED_BIG : 0),
.offset = start_block*dsk.csum_block_size,
.len = (end_block-start_block)*dsk.csum_block_size,
// save clean_entry_bitmap if we're reading clean data from the journal
@ -601,6 +601,15 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && !from_journal && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
@ -633,6 +642,8 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
}
else
{
bool csum_done = !dsk.csum_block_size || inmemory_meta;
uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size;
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
while (bmp_start < bmp_size)
{
@ -653,8 +664,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
}
if (bmp_end > bmp_start)
{
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size +
bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
if (!csum_done)
{
// Read checksums from disk
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
csum_done = true;
}
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
@ -675,11 +691,31 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
return true;
}
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset,
// Submit a read of the on-disk metadata block that covers <clean_loc>, so the
// checksums stored in it can be verified against the data being read.
// Inserts a COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL entry into the op's
// read_vec at position <rv_pos> and queues an io_uring read of the whole
// metadata block into a freshly allocated aligned buffer.
// Returns a pointer to where this entry's checksums will be located inside
// that buffer ONCE THE READ COMPLETES — the buffer is filled asynchronously,
// and handle_read_event() is responsible for freeing it after verification.
// NOTE(review): if BS_SUBMIT_GET_SQE can bail out of the enclosing function
// on queue exhaustion, <buf> would leak and the read_vec entry would stay
// unfilled — confirm the macro's failure behavior.
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
{
    auto & rv = PRIV(op)->read_vec;
    // Byte offset of the metadata block holding this object's clean entry
    auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
    // Byte offset of the entry itself inside that metadata block
    auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
    uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
    rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
        .copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
        .offset = pos,
        .buf = buf,
    });
    BS_SUBMIT_GET_SQE(sqe, data);
    data->iov = (struct iovec){ buf, dsk.meta_block_size };
    PRIV(op)->pending_ops++;
    // +meta_block_size: presumably the first metadata block is a format
    // header/superblock, so real entries start one block in — TODO confirm
    my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
    data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
    // return pointer to checksums (they follow the entry header and the two
    // bitmap copies within the clean entry)
    return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
}
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
{
assert(!(offset % dsk.csum_block_size));
uint32_t *csums = (uint32_t*)(clean_entry_bitmap + (is_journal ? 1 : 2)*dsk.clean_entry_bitmap_size);
uint32_t *csums = (uint32_t*)csum_buf;
uint32_t block_csum = 0;
uint32_t block_done = 0;
uint32_t block_num = clean_entry_bitmap ? offset/dsk.csum_block_size : 0;
@ -767,18 +803,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset
return true;
}
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov)
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov)
{
uint32_t offset = clean_loc % dsk.data_block_size;
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
// First verify against the newest checksum version
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
if (verify_padded_checksums(clean_entry_bitmap, false, offset, iov, n_iov, NULL))
if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL))
return true;
// Check through all relevant "metadata backups" possibly added by flushers
auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++)
if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta, false, offset, iov, n_iov, NULL))
if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta,
mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL))
return true;
return false;
}
@ -798,13 +835,22 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
{
// verify checksums if required
auto & rv = PRIV(op)->read_vec;
void *meta_block = NULL;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
bool ok = true;
for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--)
{
if (rv[i].copy_flags & COPY_BUF_META_BLOCK)
{
// Metadata read. Skip
assert(!meta_block);
meta_block = rv[i].buf;
rv[i].buf = NULL;
continue;
}
struct iovec *iov = (struct iovec*)((uint8_t*)rv[i].buf + (rv[i].len & 0xFFFFFFFF));
int n_iov = rv[i].len >> 32;
bool ok = true;
if (rv[i].copy_flags & COPY_BUF_JOURNAL)
{
// SMALL_WRITE from journal
@ -821,10 +867,11 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
}
);
}
else if (rv[i].csum_buf)
else if (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG)
{
// BIG_WRITE from journal
verify_padded_checksums(rv[i].csum_buf, true, rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
verify_padded_checksums(rv[i].csum_buf, rv[i].csum_buf + dsk.clean_entry_bitmap_size,
rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;
@ -839,7 +886,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
else
{
// Clean data
ok = verify_clean_padded_checksums(op, rv[i].disk_offset, iov, n_iov);
ok = verify_clean_padded_checksums(op, rv[i].disk_offset, rv[i].csum_buf, iov, n_iov);
}
if (!ok)
{
@ -867,6 +914,14 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
{
for (auto & vec: rv)
{
if (vec.copy_flags & COPY_BUF_META_BLOCK)
{
// Metadata read. Skip
assert(!meta_block);
meta_block = vec.buf;
vec.buf = NULL;
continue;
}
if (vec.csum_buf)
{
uint32_t *csum = (uint32_t*)vec.csum_buf;
@ -879,7 +934,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
op->oid.inode, op->oid.stripe, op->version,
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
crc32c(0, (uint8_t*)op->buf + vec.offset + p, dsk.csum_block_size), *csum
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
);
op->retval = -EDOM;
break;
@ -893,6 +948,12 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
}
}
}
if (meta_block)
{
// Free after checking
free(meta_block);
meta_block = NULL;
}
}
if (PRIV(op)->clean_version_used)
{

View File

@ -145,6 +145,11 @@ resume_3:
// Mark object corrupted and retry
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
if (cur_op->rmw_buf)
{
free(cur_op->rmw_buf);
cur_op->rmw_buf = NULL;
}
goto retry_1;
}
deref_object_state(pg, &op_data->object_state, true);