Verify checksums during journal reads

hotfix-1.0.0
Vitaliy Filippov 2023-01-19 02:23:27 +03:00
parent a166147110
commit 213a9ccb4d
10 changed files with 295 additions and 195 deletions

View File

@ -191,6 +191,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
// required metadata size
block_count = data_len / data_block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
meta_version = BLOCKSTORE_META_VERSION_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
if (!data_csum_type && !meta_version)
@ -201,6 +202,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
if (meta_area_size >= meta_v0_len)
{
// Old metadata fits.
printf("Warning: Forcing metadata format without checksums because the new format doesn't fit into provided area\n");
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_version = BLOCKSTORE_META_VERSION_V1;

View File

@ -475,7 +475,7 @@ resume_2:
if (copy_count && !bs->journal.inmemory && wait_journal_count > 0)
{
wait_state = wait_base+12;
goto resume_12;
return false;
}
// Submit data writes
for (it = v.begin(); it != v.end(); it++)
@ -952,11 +952,6 @@ void journal_flusher_co::scan_dirty()
}
if (bs->dsk.csum_block_size)
{
if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size)
{
// Small write not aligned for checksums. We may have to pad it
fill_incomplete = true;
}
// FIXME Remove this > sizeof(void*) inline perversion from everywhere.
// I think it doesn't matter but I couldn't stop myself from implementing it :)
uint64_t dyn_size = bs->dsk.dirty_dyn_size(dirty_it->second.offset, dirty_it->second.len);
@ -964,38 +959,15 @@ void journal_flusher_co::scan_dirty()
bs->dsk.clean_entry_bitmap_size;
it->csum_buf = dyn_from + (it->offset/bs->dsk.csum_block_size -
dirty_it->second.offset/bs->dsk.csum_block_size) * (bs->dsk.data_csum_type & 0xFF);
if (!bs->journal.inmemory)
if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size)
{
if (offset < blk_end)
// Small write not aligned for checksums. We may have to pad it
fill_incomplete = true;
if (!bs->journal.inmemory)
{
// Already being read as a part of the previous checksum block series
it->buf = blk_buf + offset - blk_begin;
it->copy_flags |= COPY_BUF_COALESCED;
if (offset+submit_len > blk_end)
it->len = blk_end-offset;
}
else if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size)
{
// We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap)
blk_begin = (offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size;
blk_begin = blk_begin < dirty_it->second.offset ? dirty_it->second.offset : blk_begin;
blk_end = ((offset+submit_len-1)/bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size;
blk_end = blk_end > end_offset ? end_offset : blk_end;
if (blk_begin < offset || blk_end > offset+submit_len)
{
blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin);
it->buf = blk_buf + offset - blk_begin;
it->copy_flags |= COPY_BUF_COALESCED;
v.push_back((copy_buffer_t){
.copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
.offset = blk_begin,
.len = blk_end-blk_begin,
.disk_offset = dirty_it->second.location + blk_begin - dirty_it->second.offset,
.buf = blk_buf,
.csum_buf = (dyn_from + (blk_begin/bs->dsk.csum_block_size -
dirty_it->second.offset/bs->dsk.csum_block_size) * (bs->dsk.data_csum_type & 0xFF)),
});
}
bs->pad_journal_read(v, *it, dirty_it->second.offset,
dirty_it->second.offset + dirty_it->second.len, dirty_it->second.location,
dyn_from, offset, submit_len, blk_begin, blk_end, blk_buf);
}
}
}
@ -1036,6 +1008,23 @@ void journal_flusher_co::scan_dirty()
if (fill_incomplete && !clean_init_bitmap)
{
// Rescan and fill incomplete writes with old data to calculate checksums
if (old_clean_loc == UINT64_MAX)
{
// May happen if the metadata entry is corrupt, but journal isn't
// FIXME: Report corrupted object to the upper layer (OSD)
printf(
"Warning: object %lx:%lx has overwrites, but doesn't have a clean version."
" Metadata is likely corrupted. Dropping object from the DB.\n",
cur.oid.inode, cur.oid.stripe
);
v.clear();
has_writes = false;
has_delete = skip_copy = true;
copy_count = 0;
fill_incomplete = false;
read_to_fill_incomplete = 0;
return;
}
uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0);
uint64_t fulfilled = 0;
read_to_fill_incomplete = bs->fill_partial_checksum_blocks(

View File

@ -333,12 +333,16 @@ class blockstore_impl_t
std::function<int(int, bool, uint32_t, uint32_t)> callback);
int fulfill_read(blockstore_op_t *read_op, uint64_t & fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector, uint8_t *csum);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, uint64_t clean_loc, uint64_t clean_ver);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_clean_checksum_block(blockstore_op_t *op, int rv_pos,
uint64_t &fulfilled, uint64_t clean_loc, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,

View File

@ -137,7 +137,7 @@ resume_1:
hdr->header_csum = 0;
if (crc32c(0, hdr, sizeof(*hdr)) != csum)
{
printf("Metadata header is corrupt (CRC mismatch).\n");
printf("Metadata header is corrupt (checksum mismatch).\n");
exit(1);
}
hdr->header_csum = csum;
@ -153,6 +153,7 @@ resume_1:
bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size)
/ (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
bs->dsk.meta_version = BLOCKSTORE_META_VERSION_V1;
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
else if (hdr->version > BLOCKSTORE_META_VERSION_V2)
{
@ -829,7 +830,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
}
}
else
else if (je->small_write.len > 0)
{
uint32_t *block_csums = (uint32_t*)((uint8_t*)je + sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size);
uint32_t start = je->small_write.offset / bs->dsk.csum_block_size;

View File

@ -9,12 +9,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
{
if (!len)
{
// Zero-length version - skip
return 1;
}
else if (IS_IN_FLIGHT(item_state))
{
// Write not finished yet - skip
// Zero-length read
return 1;
}
else if (IS_DELETE(item_state))
@ -23,6 +18,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
memset(buf, 0, len);
return 1;
}
assert(!IS_IN_FLIGHT(item_state));
if (journal.inmemory && IS_JOURNAL(item_state))
{
memcpy(buf, (uint8_t*)journal.buffer + offset, len);
@ -97,18 +93,21 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
uint32_t cur_start = item_start;
if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
{
uint64_t blk_begin = 0, blk_end = 0;
uint8_t *blk_buf = NULL;
auto & rv = PRIV(read_op)->read_vec;
cur_start = cur_start < read_op->offset ? read_op->offset : cur_start;
item_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
find_holes(PRIV(read_op)->read_vec, cur_start, item_end, [&](int pos, bool alloc, uint32_t start, uint32_t end)
find_holes(rv, cur_start, item_end, [&](int pos, bool alloc, uint32_t start, uint32_t end)
{
if (alloc)
if (!r || alloc)
return 0;
copy_buffer_t el = {
.copy_flags = (IS_JOURNAL(item_state) ? COPY_BUF_JOURNAL : COPY_BUF_DATA),
.offset = start,
.len = end-start,
.disk_offset = item_location + el.offset - item_start,
.journal_sector = journal_sector,
.journal_sector = (IS_JOURNAL(item_state) ? journal_sector : 0),
.csum_buf = !csum ? NULL : (csum + (cur_start - item_start) / dsk.csum_block_size * (dsk.data_csum_type & 0xFF)),
};
if (IS_BIG_WRITE(item_state))
@ -118,17 +117,27 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
// Very improbable, but possible
PRIV(read_op)->clean_version_used = 1;
}
PRIV(read_op)->read_vec.insert(PRIV(read_op)->read_vec.begin() + pos, el);
rv.insert(rv.begin() + pos, el);
if (!journal.inmemory && dsk.csum_block_size > dsk.bitmap_granularity && IS_JOURNAL(item_state) && !IS_DELETE(item_state))
{
int pad_state = pad_journal_read(rv, rv[pos], item_start, item_end, item_location,
csum, start, end-start, blk_begin, blk_end, blk_buf);
if (pad_state == 2)
return 1;
else if (pad_state == 1)
{
// Submit the journal checksum block read
if (!read_checksum_block(read_op, 1, fulfilled, item_location))
r = 0;
return 1;
}
}
fulfilled += el.len;
if (!fulfill_read_push(read_op,
(uint8_t*)read_op->buf + el.offset - read_op->offset,
item_location + el.offset - item_start,
el.len, item_state, item_version))
{
PRIV(read_op)->read_vec.clear();
r = 0;
return 0;
}
fulfilled += el.len;
return 1;
});
}
@ -240,10 +249,11 @@ bool blockstore_impl_t::read_range_fulfilled(std::vector<copy_buffer_t> & rv, ui
return all_done;
}
bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_pos,
uint64_t &fulfilled, uint64_t clean_loc, uint32_t item_start, uint32_t item_end)
bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc)
{
auto & rv = PRIV(op)->read_vec;
auto *vi = &rv[rv.size()-rv_pos];
uint32_t item_start = vi->offset, item_end = vi->offset+vi->len;
uint32_t fill_size = 0;
int n_iov = 0;
find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
@ -271,31 +281,32 @@ bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_po
.copy_flags = COPY_BUF_DATA,
.offset = cur_start,
.len = cur_end-cur_start,
.disk_offset = clean_loc,
});
fulfilled += cur_end-cur_start;
return 1;
}
return 0;
});
vi = &rv[rv.size()-rv_pos];
// Save buf into read_vec too but in a creepy way
// FIXME: Shit, something else should be invented %)
rv[rv.size()-rv_pos] = (copy_buffer_t){
.copy_flags = COPY_BUF_CSUM_FILL,
*vi = (copy_buffer_t){
.copy_flags = vi->copy_flags,
.offset = 0xffffffff,
.len = ((uint64_t)n_iov << 32) | fill_size,
.disk_offset = clean_loc + item_start,
.csum_buf = (uint8_t*)buf,
.buf = (uint8_t*)buf,
.csum_buf = vi->csum_buf,
};
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
uint32_t d_pos = 0;
for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
{
int n_cur = n_iov-n_pos < IOV_MAX ? n_iov-n_pos : IOV_MAX;
BS_SUBMIT_GET_SQE(sqe, data);
PRIV(op)->pending_ops++;
my_uring_prep_readv(
sqe, dsk.data_fd, iov + n_pos, n_cur, dsk.data_offset + clean_loc + d_pos
);
my_uring_prep_readv(sqe, submit_fd, iov + n_pos, n_cur, submit_offset + clean_loc + d_pos);
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
if (n_pos > 0 || n_pos + IOV_MAX < n_iov)
{
@ -308,14 +319,17 @@ bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_po
else
data->iov.iov_len = item_end-item_start;
}
// Reading may race with flushing.
// - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta
// - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7
// - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs
// - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata
// otherwise the checksum may not match
// So flushers save a copy of punched metadata if the object is being read during (6).
PRIV(op)->clean_version_used = 1;
if (!(vi->copy_flags & COPY_BUF_JOURNAL))
{
// Reading may race with flushing.
// - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta
// - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7
// - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs
// - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata
// otherwise the checksum may not match
// So flushers save a copy of punched metadata if the object is being read during (6).
PRIV(op)->clean_version_used = 1;
}
return true;
}
@ -341,6 +355,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
uint64_t fulfilled = 0;
PRIV(read_op)->pending_ops = 0;
PRIV(read_op)->clean_version_used = 0;
auto & rv = PRIV(read_op)->read_vec;
uint64_t result_version = 0;
if (dirty_found)
{
@ -369,15 +384,23 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
}
// If inmemory_journal is false, journal trim will have to wait until the read is completed
// FIXME: Verify checksums when reading from journal disk
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
(IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0),
journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size))
if (!IS_JOURNAL(dirty.state))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
// Read from data disk, possibly checking checksums
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dirty.location, dirty_it->first.version))
{
goto undo_read;
}
}
else
{
// Copy from memory or read from journal, possibly checking checksums
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location, dirty.journal_sector+1,
journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size))
{
goto undo_read;
}
}
}
if (fulfilled == read_op->len || dirty_it == dirty_db.begin())
@ -401,84 +424,9 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (fulfilled < read_op->len)
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
if (!dsk.clean_entry_bitmap_size)
if (!fulfill_clean_read(read_op, fulfilled, clean_entry_bitmap, clean_it->second.location, clean_it->second.version))
{
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0,
clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
for (int i = req; i > 0; i--)
{
auto & vi = rv[rv.size()-i];
if (!read_clean_checksum_block(read_op, i, fulfilled, clean_it->second.location, vi.offset, vi.offset+vi.len))
{
// need to wait. undo added requests, don't dequeue op
for (auto & vec: rv)
{
if (vec.copy_flags == COPY_BUF_CSUM_FILL && vec.csum_buf)
{
free(vec.csum_buf);
vec.csum_buf = NULL;
}
}
rv.clear();
return 0;
}
}
}
else
{
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap +
2*dsk.clean_entry_bitmap_size +
bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0, csum))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
}
}
}
// Increment counter if clean data is being read from the disk
if (PRIV(read_op)->clean_version_used)
{
obj_ver_id ov = { .oid = read_op->oid, .version = clean_it->second.version };
used_clean_objects[ov].refs++;
PRIV(read_op)->clean_version_used = ov.version;
goto undo_read;
}
}
}
@ -490,11 +438,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
FINISH_OP(read_op);
return 2;
}
if (fulfilled < read_op->len)
{
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL));
assert(fulfilled == read_op->len);
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
if (!PRIV(read_op)->pending_ops)
{
@ -519,6 +463,127 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
read_op->retval = 0;
return 2;
undo_read:
// need to wait. undo added requests, don't dequeue op
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
for (auto & vec: rv)
{
if ((vec.copy_flags & COPY_BUF_CSUM_FILL) && vec.buf)
{
free(vec.buf);
vec.buf = NULL;
}
}
}
rv.clear();
return 0;
}
// Pad an unaligned journal data read out to whole checksum blocks so the data
// checksums can be verified after the read completes.
// Return value protocol:
//   2 - this copy was fully coalesced into a checksum-block read started by a
//       previous call (offset < blk_end); no new read was queued
//   1 - a new padded COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL read covering
//       [blk_begin, blk_end) was appended to <rv>
//   0 - no padding needed (the read is already checksum-block-aligned, or the
//       padded range equals the requested range)
// NOTE(review): blk_begin / blk_end / blk_buf are in-out parameters that carry
// state between consecutive calls for the same dirty entry — presumably callers
// iterate over adjacent copies and reuse them; confirm against call sites.
int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
    // FIXME Passing dirty_entry& would be nicer
    uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr,
    uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf)
{
    // Only act when either edge of the requested range is not aligned to the
    // checksum block size
    if (offset % dsk.csum_block_size || submit_len % dsk.csum_block_size)
    {
        if (offset < blk_end)
        {
            // Already being read as a part of the previous checksum block series
            cp.buf = blk_buf + offset - blk_begin;
            cp.copy_flags |= COPY_BUF_COALESCED;
            // Clamp this copy to the end of the already-queued padded read
            if (offset+submit_len > blk_end)
                cp.len = blk_end-offset;
            return 2;
        }
        else
        {
            // We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap)
            // Round the range outwards to checksum block boundaries, then clamp
            // it to the dirty entry's own extent
            blk_begin = (offset/dsk.csum_block_size) * dsk.csum_block_size;
            blk_begin = blk_begin < dirty_offset ? dirty_offset : blk_begin;
            blk_end = ((offset+submit_len-1)/dsk.csum_block_size + 1) * dsk.csum_block_size;
            blk_end = blk_end > dirty_end ? dirty_end : blk_end;
            if (blk_begin < offset || blk_end > offset+submit_len)
            {
                // The padded range is strictly wider than the requested one -
                // read the whole checksum block series into a separate buffer
                blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin);
                cp.buf = blk_buf + offset - blk_begin;
                cp.copy_flags |= COPY_BUF_COALESCED;
                rv.push_back((copy_buffer_t){
                    .copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
                    .offset = blk_begin,
                    .len = blk_end-blk_begin,
                    .disk_offset = dirty_loc + blk_begin - dirty_offset,
                    .buf = blk_buf,
                    // Checksum pointer is offset by whole checksum blocks from
                    // the beginning of the dirty entry
                    .csum_buf = (csum_ptr + (blk_begin/dsk.csum_block_size -
                        dirty_offset/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)),
                });
                return 1;
            }
        }
    }
    return 0;
}
// Fulfill (part of) a read request from the clean (stable) object copy located
// at <clean_loc> with version <clean_ver>.
// Two strategies depending on checksum granularity:
//   - csum_block_size > bitmap_granularity: partial checksum blocks must be
//     read in full to be verifiable, so fill_partial_checksum_blocks() plans
//     the padded reads and read_checksum_block() submits each of them
//   - otherwise: walk the allocation bitmap, zero-fill unallocated ranges and
//     read allocated ranges (with per-range checksum pointers) via fulfill_read()
// Returns false when a submission could not be made and the caller must undo
// the queued requests and retry the op later.
bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
    uint8_t *clean_entry_bitmap, uint64_t clean_loc, uint64_t clean_ver)
{
    if (dsk.csum_block_size > dsk.bitmap_granularity)
    {
        auto & rv = PRIV(read_op)->read_vec;
        // Plan padded checksum-block reads; <req> is the number of planned reads
        int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap,
            (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
        for (int i = req; i > 0; i--)
        {
            if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
            {
                return false;
            }
        }
        // Disk reads were submitted iff at least one block read was planned
        PRIV(read_op)->clean_version_used = req > 0;
    }
    else
    {
        // Iterate over runs of unallocated/allocated bits in the bitmap
        uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
        while (bmp_start < bmp_size)
        {
            // Skip over a run of zero (unallocated) bits
            while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
            {
                bmp_end++;
            }
            if (bmp_end > bmp_start)
            {
                // fill with zeroes
                assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
                    bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL));
            }
            bmp_start = bmp_end;
            // Collect the following run of set (allocated) bits
            while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
            {
                bmp_end++;
            }
            if (bmp_end > bmp_start)
            {
                // Checksums are stored after the two bitmaps in the clean entry;
                // index by checksum block of the run's starting offset
                uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap +
                    2*dsk.clean_entry_bitmap_size +
                    bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
                if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
                    bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
                    clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum))
                {
                    return false;
                }
                bmp_start = bmp_end;
            }
        }
    }
    // Increment reference counter if clean data is being read from the disk
    // (prevents the clean version from being dropped while the read is in flight)
    if (PRIV(read_op)->clean_version_used)
    {
        obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver };
        used_clean_objects[ov].refs++;
        PRIV(read_op)->clean_version_used = ov.version;
    }
    return true;
}
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint32_t offset,
@ -646,13 +711,16 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
auto & rv = PRIV(op)->read_vec;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
for (int i = rv.size()-1; i >= 0 && rv[i].copy_flags == COPY_BUF_CSUM_FILL; i--)
for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--)
{
struct iovec *iov = (struct iovec*)(rv[i].csum_buf + (rv[i].len & 0xFFFFFFFF));
if (!verify_read_padded_checksums(op, rv[i].disk_offset, iov, rv[i].len >> 32))
struct iovec *iov = (struct iovec*)(rv[i].buf + (rv[i].len & 0xFFFFFFFF));
int n_iov = rv[i].len >> 32;
if (!((rv[i].copy_flags & COPY_BUF_JOURNAL)
? verify_journal_checksums(rv[i].csum_buf, rv[i].disk_offset % dsk.data_block_size, iov, n_iov, NULL)
: verify_read_padded_checksums(op, rv[i].disk_offset, iov, n_iov)))
op->retval = -EDOM;
free(rv[i].csum_buf);
rv[i].csum_buf = NULL;
free(rv[i].buf);
rv[i].buf = NULL;
}
}
else

View File

@ -194,9 +194,12 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
data_csums[0] = fn(0, op->buf, op->len, op->offset - start*dsk.csum_block_size, end*dsk.csum_block_size - (op->offset+op->len));
else
{
// First block
data_csums[0] = fn(0, op->buf, dsk.csum_block_size*(start+1)-op->offset, op->offset - start*dsk.csum_block_size, 0);
// Intermediate blocks
for (uint32_t i = start+1; i < end; i++)
data_csums[i-start] = crc32c(0, (uint8_t*)op->buf + dsk.csum_block_size*i-op->offset, dsk.csum_block_size);
// Last block
data_csums[end-start] = fn(
0, (uint8_t*)op->buf + end*dsk.csum_block_size - op->offset,
op->offset+op->len - end*dsk.csum_block_size,

View File

@ -199,16 +199,44 @@ int disk_tool_t::process_journal_block(void *buf, std::function<void(int, journa
assert(pread(dsk.journal_fd, small_write_data, je->small_write.len, dsk.journal_offset+je->small_write.data_offset) == je->small_write.len);
data_crc32 = je_start.csum_block_size ? 0 : crc32c(0, small_write_data, je->small_write.len);
data_csum_valid = (data_crc32 == je->small_write.crc32_data);
if (je_start.csum_block_size)
if (je_start.csum_block_size && je->small_write.len > 0)
{
uint32_t data_csum_size = je->small_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF);
uint32_t *block_csums = (uint32_t*)((uint8_t*)je + je->size - data_csum_size);
for (uint32_t pos = 0; pos < je->small_write.len; pos += je_start.csum_block_size, block_csums++)
// like in enqueue_write()
uint32_t start = je->small_write.offset / je_start.csum_block_size;
uint32_t end = (je->small_write.offset+je->small_write.len-1) / je_start.csum_block_size;
uint32_t data_csum_size = (end-start+1) * (je_start.data_csum_type & 0xFF);
if (je->size < sizeof(journal_entry_small_write) + data_csum_size)
{
if (crc32c(0, (uint8_t*)small_write_data + pos, je_start.csum_block_size) != *block_csums)
data_csum_valid = false;
}
else
{
uint32_t calc_csum = 0;
uint32_t *block_csums = (uint32_t*)((uint8_t*)je + je->size - data_csum_size);
if (start == end)
{
data_csum_valid = false;
break;
calc_csum = crc32c(0, (uint8_t*)small_write_data, je->small_write.len);
data_csum_valid = data_csum_valid && (calc_csum == *block_csums++);
}
else
{
// First block
calc_csum = crc32c(0, (uint8_t*)small_write_data,
je_start.csum_block_size*(start+1)-je->small_write.offset);
data_csum_valid = data_csum_valid && (calc_csum == *block_csums++);
// Intermediate blocks
for (uint32_t i = start+1; i < end; i++)
{
calc_csum = crc32c(0, (uint8_t*)small_write_data +
je_start.csum_block_size*i-je->small_write.offset, je_start.csum_block_size);
data_csum_valid = data_csum_valid && (calc_csum == *block_csums++);
}
// Last block
calc_csum = crc32c(
0, (uint8_t*)small_write_data + end*je_start.csum_block_size - je->small_write.offset,
je->small_write.offset+je->small_write.len - end*je_start.csum_block_size
);
data_csum_valid = data_csum_valid && (calc_csum == *block_csums++);
}
}
}
@ -265,20 +293,22 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
}
else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
{
auto & sw = je->small_write;
printf(
json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
: "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe,
je->small_write.version, je->small_write.offset, je->small_write.len,
je->small_write.data_offset
sw.oid.inode, sw.oid.stripe, sw.version, sw.offset, sw.len, sw.data_offset
);
if (journal_calc_data_pos != je->small_write.data_offset)
if (journal_calc_data_pos != sw.data_offset)
{
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
: " (mismatched, calculated = %lu)", journal_pos);
}
uint32_t data_csum_size = (!je_start.csum_block_size ? 0 : je->small_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF));
uint32_t data_csum_size = (!je_start.csum_block_size
? 0
: ((sw.offset + sw.len - 1)/je_start.csum_block_size - sw.offset/je_start.csum_block_size + 1)
*(je_start.data_csum_type & 0xFF));
if (je->size > sizeof(journal_entry_small_write) + data_csum_size)
{
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
@ -291,13 +321,13 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (dump_with_data)
{
printf(json ? ",\"data\":\"" : " (data: ");
for (int i = 0; i < je->small_write.len; i++)
for (int i = 0; i < sw.len; i++)
{
printf("%02x", ((uint8_t*)small_write_data)[i]);
}
printf(json ? "\"" : ")");
}
if (data_csum_size > 0)
if (data_csum_size > 0 && je->size >= sizeof(journal_entry_small_write) + data_csum_size)
{
printf(json ? ",\"block_csums\":\"" : " block_csums=");
uint8_t *block_csums = (uint8_t*)je + je->size - data_csum_size;
@ -307,27 +337,29 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
}
else
{
printf(json ? ",\"data_crc32\":\"%08x\"" : " data_crc32=%08x", je->small_write.crc32_data);
printf(json ? ",\"data_crc32\":\"%08x\"" : " data_crc32=%08x", sw.crc32_data);
}
printf(
json ? ",\"data_valid\":%s}" : "%s\n",
(data_csum_valid
? (json ? "false" : " (invalid)")
: (json ? "true" : " (valid)"))
? (json ? "true" : " (valid)")
: (json ? "false" : " (invalid)"))
);
}
else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
auto & bw = je->big_write;
printf(
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
: "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe,
je->big_write.version, je->big_write.offset, je->big_write.len,
je->big_write.location
bw.oid.inode, bw.oid.stripe, bw.version, bw.offset, bw.len, bw.location
);
uint32_t data_csum_size = (!je_start.csum_block_size ? 0 : je->big_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF));
if (data_csum_size > 0)
uint32_t data_csum_size = (!je_start.csum_block_size
? 0
: ((bw.offset + bw.len - 1)/je_start.csum_block_size - bw.offset/je_start.csum_block_size + 1)
*(je_start.data_csum_type & 0xFF));
if (data_csum_size > 0 && je->size >= sizeof(journal_entry_big_write) + data_csum_size)
{
printf(json ? ",\"block_csums\":\"" : " block_csums=");
uint8_t *block_csums = (uint8_t*)je + je->size - data_csum_size;
@ -335,10 +367,10 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
printf("%02x", block_csums[i]);
printf(json ? "\"" : "");
}
if (je->big_write.size > sizeof(journal_entry_big_write) + data_csum_size)
if (bw.size > sizeof(journal_entry_big_write) + data_csum_size)
{
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size - data_csum_size; i++)
for (int i = sizeof(journal_entry_big_write); i < bw.size - data_csum_size; i++)
{
printf("%02x", ((uint8_t*)je)[i]);
}

View File

@ -33,14 +33,14 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
{
if (hdr->version == BLOCKSTORE_META_VERSION_V1)
{
// Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps
// Vitastor 0.6-0.8 - static array of clean_disk_entry with bitmaps
hdr->data_csum_type = 0;
hdr->csum_block_size = 0;
hdr->header_csum = 0;
}
else if (hdr->version == BLOCKSTORE_META_VERSION_V2)
{
// Vitastor 0.8 - static array of clean_disk_entry with bitmaps and checksums
// Vitastor 0.9 - static array of clean_disk_entry with bitmaps and checksums
if (hdr->data_csum_type != 0 &&
hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C)
{
@ -169,7 +169,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
else if (hdr->version == BLOCKSTORE_META_VERSION_V2)
{
printf(
"{\"version\":\"0.8\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
"{\"version\":\"0.9\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
"\"data_csum_type\":%s,\"csum_block_size\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size

View File

@ -49,7 +49,7 @@ kill_osds()
kill_osds &
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -blockalign=4k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \
-randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
qemu-img convert -S 4096 -p \

1
tests/test_heal_csum.sh Executable file
View File

@ -0,0 +1 @@
OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh