Support keeping checksums on disk (not in memory)
Definitely beneficial for SSD+HDD setups
hotfix-1.0.0
parent
819cb70cdd
commit
a8464c19af
|
@ -666,7 +666,10 @@ void journal_flusher_co::update_metadata_entry()
|
|||
new_entry->oid = cur.oid;
|
||||
new_entry->version = cur.version;
|
||||
if (!bs->inmemory_meta)
|
||||
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_dyn_size);
|
||||
{
|
||||
auto inmem_bmp = (uint8_t*)bs->clean_bitmaps + (clean_loc >> bs->dsk.block_order)*2*bs->dsk.clean_entry_bitmap_size;
|
||||
memcpy(inmem_bmp, new_clean_bitmap, 2*bs->dsk.clean_entry_bitmap_size);
|
||||
}
|
||||
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
// Calculate metadata entry checksum
|
||||
|
@ -767,7 +770,8 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
|||
{
|
||||
assert(!(v[i].offset % bs->dsk.csum_block_size));
|
||||
assert(!(v[i].len % bs->dsk.csum_block_size));
|
||||
bs->verify_padded_checksums(new_clean_bitmap, false, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
|
||||
v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n",
|
||||
cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
|
||||
|
@ -1131,9 +1135,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
|
|||
resume_0:
|
||||
if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
|
||||
return false;
|
||||
new_clean_bitmap = (bs->inmemory_meta
|
||||
? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry)
|
||||
: (uint8_t*)bs->clean_dyn_data + (clean_loc >> bs->dsk.block_order)*bs->dsk.clean_dyn_size);
|
||||
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
|
||||
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||
{
|
||||
resume_1:
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
#define COPY_BUF_ZERO 4
|
||||
#define COPY_BUF_CSUM_FILL 8
|
||||
#define COPY_BUF_COALESCED 16
|
||||
#define COPY_BUF_META_BLOCK 32
|
||||
#define COPY_BUF_JOURNALED_BIG 64
|
||||
|
||||
struct copy_buffer_t
|
||||
{
|
||||
|
|
|
@ -39,8 +39,8 @@ blockstore_impl_t::~blockstore_impl_t()
|
|||
dsk.close_all();
|
||||
if (metadata_buffer)
|
||||
free(metadata_buffer);
|
||||
if (clean_dyn_data)
|
||||
free(clean_dyn_data);
|
||||
if (clean_bitmaps)
|
||||
free(clean_bitmaps);
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::is_started()
|
||||
|
|
|
@ -269,7 +269,7 @@ class blockstore_impl_t
|
|||
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
uint8_t *clean_dyn_data = NULL;
|
||||
uint8_t *clean_bitmaps = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||
|
@ -347,11 +347,12 @@ class blockstore_impl_t
|
|||
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
|
||||
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
|
||||
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
|
||||
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset,
|
||||
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
|
||||
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov);
|
||||
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||
uint32_t item_state, uint64_t item_version);
|
||||
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
|
|
@ -334,7 +334,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
|||
}
|
||||
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
|
||||
{
|
||||
memcpy(bs->clean_dyn_data + (done_cnt+i)*bs->dsk.clean_dyn_size, &entry->bitmap, bs->dsk.clean_dyn_size);
|
||||
memcpy(bs->clean_bitmaps + (done_cnt+i) * 2 * bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2 * bs->dsk.clean_entry_bitmap_size);
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(entry->oid);
|
||||
auto clean_it = clean_db.find(entry->oid);
|
||||
|
|
|
@ -139,13 +139,12 @@ void blockstore_impl_t::calc_lengths()
|
|||
}
|
||||
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
|
||||
{
|
||||
// FIXME: allow to store bitmap, but read checksums from the disk
|
||||
clean_dyn_data = (uint8_t*)malloc(dsk.block_count * dsk.clean_dyn_size);
|
||||
if (!clean_dyn_data)
|
||||
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
|
||||
if (!clean_bitmaps)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Failed to allocate memory for the metadata sparse write bitmap ("+
|
||||
std::to_string(dsk.block_count * dsk.clean_dyn_size / 1024 / 1024)+" MB)"
|
||||
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,7 +159,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
|
|||
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
|
||||
}
|
||||
else
|
||||
clean_entry_bitmap = (uint8_t*)(clean_dyn_data + meta_loc*dsk.clean_dyn_size + offset);
|
||||
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
|
||||
return clean_entry_bitmap;
|
||||
}
|
||||
|
||||
|
@ -196,7 +196,7 @@ int blockstore_impl_t::fill_partial_checksum_blocks(std::vector<copy_buffer_t> &
|
|||
end_block++;
|
||||
// OK, mark this range as required
|
||||
rv.push_back((copy_buffer_t){
|
||||
.copy_flags = COPY_BUF_CSUM_FILL,
|
||||
.copy_flags = COPY_BUF_CSUM_FILL | (from_journal ? COPY_BUF_JOURNALED_BIG : 0),
|
||||
.offset = start_block*dsk.csum_block_size,
|
||||
.len = (end_block-start_block)*dsk.csum_block_size,
|
||||
// save clean_entry_bitmap if we're reading clean data from the journal
|
||||
|
@ -601,6 +601,15 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
auto & rv = PRIV(read_op)->read_vec;
|
||||
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
|
||||
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
|
||||
if (!inmemory_meta && !from_journal && req > 0)
|
||||
{
|
||||
// Read checksums from disk
|
||||
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
|
||||
for (int i = req; i > 0; i--)
|
||||
{
|
||||
rv[rv.size()-i].csum_buf = csum_buf;
|
||||
}
|
||||
}
|
||||
for (int i = req; i > 0; i--)
|
||||
{
|
||||
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
|
||||
|
@ -633,6 +642,8 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
}
|
||||
else
|
||||
{
|
||||
bool csum_done = !dsk.csum_block_size || inmemory_meta;
|
||||
uint8_t *csum_buf = clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size;
|
||||
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
|
||||
while (bmp_start < bmp_size)
|
||||
{
|
||||
|
@ -653,8 +664,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
}
|
||||
if (bmp_end > bmp_start)
|
||||
{
|
||||
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size +
|
||||
bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
|
||||
if (!csum_done)
|
||||
{
|
||||
// Read checksums from disk
|
||||
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
|
||||
csum_done = true;
|
||||
}
|
||||
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + bmp_start*(dsk.data_csum_type & 0xFF));
|
||||
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
|
||||
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
|
||||
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
|
||||
|
@ -675,11 +691,31 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
|
|||
return true;
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, bool is_journal, uint32_t offset,
|
||||
// Submit a read of the on-disk metadata block that contains the clean entry
// for the data block at <clean_loc>, and register the read buffer in the
// operation's read vector at index <rv_pos>.
//
// op        - read operation on whose behalf the metadata block is read
// clean_loc - location of the data block; shifted right by dsk.block_order to get the block index
// rv_pos    - position in PRIV(op)->read_vec at which the new buffer entry is inserted
//
// Returns a pointer inside the newly allocated buffer, at the offset where the
// entry's checksums are located (after clean_disk_entry and two bitmaps of
// dsk.clean_entry_bitmap_size bytes each).
// NOTE(review): the read is only queued here, so the returned pointer presumably
// refers to valid data only after the read completes - confirm with callers.
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
|
||||
{
|
||||
auto & rv = PRIV(op)->read_vec;
|
||||
// Byte offset of the metadata block holding this entry: (block index / entries per metadata block) * metadata block size
auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
|
||||
// Byte offset of the entry inside that metadata block
auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
|
||||
// Aligned buffer for one whole metadata block
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
|
||||
// Remember the buffer in the read vector; COPY_BUF_META_BLOCK marks it as a metadata read
// so that handle_read_event() skips it during verification and frees it afterwards
rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
|
||||
.copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
|
||||
.offset = pos,
|
||||
.buf = buf,
|
||||
});
|
||||
// Macro defined elsewhere; it provides the `sqe` and `data` names used below
// NOTE(review): presumably it may bail out when no submission queue entry is available - verify
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
data->iov = (struct iovec){ buf, dsk.meta_block_size };
|
||||
PRIV(op)->pending_ops++;
|
||||
// The extra dsk.meta_block_size skips one metadata block at the start of the
// metadata area (presumably a header/superblock - TODO confirm)
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
||||
// Completion is routed to the same handler as the other reads of this operation
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||
// return pointer to checksums
|
||||
return buf + pos + sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
|
||||
{
|
||||
assert(!(offset % dsk.csum_block_size));
|
||||
uint32_t *csums = (uint32_t*)(clean_entry_bitmap + (is_journal ? 1 : 2)*dsk.clean_entry_bitmap_size);
|
||||
uint32_t *csums = (uint32_t*)csum_buf;
|
||||
uint32_t block_csum = 0;
|
||||
uint32_t block_done = 0;
|
||||
uint32_t block_num = clean_entry_bitmap ? offset/dsk.csum_block_size : 0;
|
||||
|
@ -767,18 +803,19 @@ bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset
|
|||
return true;
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, iovec *iov, int n_iov)
|
||||
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *csum_buf, iovec *iov, int n_iov)
|
||||
{
|
||||
uint32_t offset = clean_loc % dsk.data_block_size;
|
||||
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
|
||||
// First verify against the newest checksum version
|
||||
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
|
||||
if (verify_padded_checksums(clean_entry_bitmap, false, offset, iov, n_iov, NULL))
|
||||
if (verify_padded_checksums(clean_entry_bitmap, csum_buf ? csum_buf : (clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size), offset, iov, n_iov, NULL))
|
||||
return true;
|
||||
// Check through all relevant "metadata backups" possibly added by flushers
|
||||
auto mb_it = used_clean_objects.lower_bound((obj_ver_id){ .oid = op->oid, .version = PRIV(op)->clean_version_used });
|
||||
for (; mb_it != used_clean_objects.end() && mb_it->first.oid == op->oid; mb_it++)
|
||||
if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta, false, offset, iov, n_iov, NULL))
|
||||
if (mb_it->second.meta != NULL && verify_padded_checksums(mb_it->second.meta,
|
||||
mb_it->second.meta + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, NULL))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
@ -798,13 +835,22 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
{
|
||||
// verify checksums if required
|
||||
auto & rv = PRIV(op)->read_vec;
|
||||
void *meta_block = NULL;
|
||||
if (dsk.csum_block_size > dsk.bitmap_granularity)
|
||||
{
|
||||
bool ok = true;
|
||||
for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--)
|
||||
{
|
||||
if (rv[i].copy_flags & COPY_BUF_META_BLOCK)
|
||||
{
|
||||
// Metadata read. Skip
|
||||
assert(!meta_block);
|
||||
meta_block = rv[i].buf;
|
||||
rv[i].buf = NULL;
|
||||
continue;
|
||||
}
|
||||
struct iovec *iov = (struct iovec*)((uint8_t*)rv[i].buf + (rv[i].len & 0xFFFFFFFF));
|
||||
int n_iov = rv[i].len >> 32;
|
||||
bool ok = true;
|
||||
if (rv[i].copy_flags & COPY_BUF_JOURNAL)
|
||||
{
|
||||
// SMALL_WRITE from journal
|
||||
|
@ -821,10 +867,11 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
}
|
||||
);
|
||||
}
|
||||
else if (rv[i].csum_buf)
|
||||
else if (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG)
|
||||
{
|
||||
// BIG_WRITE from journal
|
||||
verify_padded_checksums(rv[i].csum_buf, true, rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
|
||||
verify_padded_checksums(rv[i].csum_buf, rv[i].csum_buf + dsk.clean_entry_bitmap_size,
|
||||
rv[i].disk_offset % dsk.data_block_size, iov, n_iov,
|
||||
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
ok = false;
|
||||
|
@ -839,7 +886,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
else
|
||||
{
|
||||
// Clean data
|
||||
ok = verify_clean_padded_checksums(op, rv[i].disk_offset, iov, n_iov);
|
||||
ok = verify_clean_padded_checksums(op, rv[i].disk_offset, rv[i].csum_buf, iov, n_iov);
|
||||
}
|
||||
if (!ok)
|
||||
{
|
||||
|
@ -867,6 +914,14 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
{
|
||||
for (auto & vec: rv)
|
||||
{
|
||||
if (vec.copy_flags & COPY_BUF_META_BLOCK)
|
||||
{
|
||||
// Metadata read. Skip
|
||||
assert(!meta_block);
|
||||
meta_block = vec.buf;
|
||||
vec.buf = NULL;
|
||||
continue;
|
||||
}
|
||||
if (vec.csum_buf)
|
||||
{
|
||||
uint32_t *csum = (uint32_t*)vec.csum_buf;
|
||||
|
@ -879,7 +934,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
|
||||
crc32c(0, (uint8_t*)op->buf + vec.offset + p, dsk.csum_block_size), *csum
|
||||
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
|
||||
);
|
||||
op->retval = -EDOM;
|
||||
break;
|
||||
|
@ -893,6 +948,12 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||
}
|
||||
}
|
||||
}
|
||||
if (meta_block)
|
||||
{
|
||||
// Free after checking
|
||||
free(meta_block);
|
||||
meta_block = NULL;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->clean_version_used)
|
||||
{
|
||||
|
|
|
@ -145,6 +145,11 @@ resume_3:
|
|||
// Mark object corrupted and retry
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
|
||||
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
|
||||
if (cur_op->rmw_buf)
|
||||
{
|
||||
free(cur_op->rmw_buf);
|
||||
cur_op->rmw_buf = NULL;
|
||||
}
|
||||
goto retry_1;
|
||||
}
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
|
|
Loading…
Reference in New Issue