From 213a9ccb4def20617b36c5e44d91ff3b31a457b4 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 19 Jan 2023 02:23:27 +0300 Subject: [PATCH] Verify checksums during journal reads --- src/blockstore_disk.cpp | 2 + src/blockstore_flush.cpp | 61 +++----- src/blockstore_impl.h | 8 +- src/blockstore_init.cpp | 5 +- src/blockstore_read.cpp | 322 +++++++++++++++++++++++--------------- src/blockstore_write.cpp | 3 + src/disk_tool_journal.cpp | 80 +++++++--- src/disk_tool_meta.cpp | 6 +- tests/test_heal.sh | 2 +- tests/test_heal_csum.sh | 1 + 10 files changed, 295 insertions(+), 195 deletions(-) create mode 100755 tests/test_heal_csum.sh diff --git a/src/blockstore_disk.cpp b/src/blockstore_disk.cpp index 0ea8a2e4..bab7eda1 100644 --- a/src/blockstore_disk.cpp +++ b/src/blockstore_disk.cpp @@ -191,6 +191,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check) // required metadata size block_count = data_len / data_block_size; meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size; + meta_version = BLOCKSTORE_META_VERSION_V2; if (!skip_meta_check && meta_area_size < meta_len) { if (!data_csum_type && !meta_version) @@ -201,6 +202,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check) if (meta_area_size >= meta_v0_len) { // Old metadata fits. + printf("Warning: Forcing metadata format without checksums because the new format doesn't fit into provided area\n"); clean_entry_size = clean_entry_v0_size; meta_len = meta_v0_len; meta_version = BLOCKSTORE_META_VERSION_V1; diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index c01e4be7..8fcf3987 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -475,7 +475,7 @@ resume_2: if (copy_count && !bs->journal.inmemory && wait_journal_count > 0) { wait_state = wait_base+12; - goto resume_12; + return false; } // Submit data writes for (it = v.begin(); it != v.end(); it++) @@ -952,11 +952,6 @@ void journal_flusher_co::scan_dirty() } if (bs->dsk.csum_block_size) { - if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size) - { - // Small write not aligned for checksums. We may have to pad it - fill_incomplete = true; - } // FIXME Remove this > sizeof(void*) inline perversion from everywhere. // I think it doesn't matter but I couldn't stop myself from implementing it :) uint64_t dyn_size = bs->dsk.dirty_dyn_size(dirty_it->second.offset, dirty_it->second.len); @@ -964,38 +959,15 @@ void journal_flusher_co::scan_dirty() bs->dsk.clean_entry_bitmap_size; it->csum_buf = dyn_from + (it->offset/bs->dsk.csum_block_size - dirty_it->second.offset/bs->dsk.csum_block_size) * (bs->dsk.data_csum_type & 0xFF); - if (!bs->journal.inmemory) + if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size) { - if (offset < blk_end) + // Small write not aligned for checksums. We may have to pad it + fill_incomplete = true; + if (!bs->journal.inmemory) { - // Already being read as a part of the previous checksum block series - it->buf = blk_buf + offset - blk_begin; - it->copy_flags |= COPY_BUF_COALESCED; - if (offset+submit_len > blk_end) - it->len = blk_end-offset; - } - else if (offset % bs->dsk.csum_block_size || submit_len % bs->dsk.csum_block_size) - { - // We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap) - blk_begin = (offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size; - blk_begin = blk_begin < dirty_it->second.offset ? dirty_it->second.offset : blk_begin; - blk_end = ((offset+submit_len-1)/bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size; - blk_end = blk_end > end_offset ? end_offset : blk_end; - if (blk_begin < offset || blk_end > offset+submit_len) - { - blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin); - it->buf = blk_buf + offset - blk_begin; - it->copy_flags |= COPY_BUF_COALESCED; - v.push_back((copy_buffer_t){ - .copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL, - .offset = blk_begin, - .len = blk_end-blk_begin, - .disk_offset = dirty_it->second.location + blk_begin - dirty_it->second.offset, - .buf = blk_buf, - .csum_buf = (dyn_from + (blk_begin/bs->dsk.csum_block_size - - dirty_it->second.offset/bs->dsk.csum_block_size) * (bs->dsk.data_csum_type & 0xFF)), - }); - } + bs->pad_journal_read(v, *it, dirty_it->second.offset, + dirty_it->second.offset + dirty_it->second.len, dirty_it->second.location, + dyn_from, offset, submit_len, blk_begin, blk_end, blk_buf); } } } @@ -1036,6 +1008,23 @@ void journal_flusher_co::scan_dirty() if (fill_incomplete && !clean_init_bitmap) { // Rescan and fill incomplete writes with old data to calculate checksums + if (old_clean_loc == UINT64_MAX) + { + // May happen if the metadata entry is corrupt, but journal isn't + // FIXME: Report corrupted object to the upper layer (OSD) + printf( + "Warning: object %lx:%lx has overwrites, but doesn't have a clean version." + " Metadata is likely corrupted. Dropping object from the DB.\n", + cur.oid.inode, cur.oid.stripe + ); + v.clear(); + has_writes = false; + has_delete = skip_copy = true; + copy_count = 0; + fill_incomplete = false; + read_to_fill_incomplete = 0; + return; + } uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0); uint64_t fulfilled = 0; read_to_fill_incomplete = bs->fill_partial_checksum_blocks( diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index 381bdefa..df223cce 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -333,12 +333,16 @@ class blockstore_impl_t std::function callback); int fulfill_read(blockstore_op_t *read_op, uint64_t & fulfilled, uint32_t item_start, uint32_t item_end, uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector, uint8_t *csum); + bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled, + uint8_t *clean_entry_bitmap, uint64_t clean_loc, uint64_t clean_ver); int fill_partial_checksum_blocks(std::vector & rv, uint64_t & fulfilled, uint8_t *clean_entry_bitmap, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end); + int pad_journal_read(std::vector & rv, copy_buffer_t & cp, + uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, + uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf); bool read_range_fulfilled(std::vector & rv, uint64_t & fulfilled, uint8_t *read_buf, uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end); - bool read_clean_checksum_block(blockstore_op_t *op, int rv_pos, - uint64_t &fulfilled, uint64_t clean_loc, uint32_t item_start, uint32_t item_end); + bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc); bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint32_t offset, iovec *iov, int n_iov, std::function bad_block_cb); bool verify_journal_checksums(uint8_t *csums, uint32_t offset, diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index c2175cdf..a334b11b 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -137,7 +137,7 @@ resume_1: hdr->header_csum = 0; if (crc32c(0, hdr, sizeof(*hdr)) != csum) { - printf("Metadata header is corrupt (CRC mismatch).\n"); + printf("Metadata header is corrupt (checksum mismatch).\n"); exit(1); } hdr->header_csum = csum; @@ -153,6 +153,7 @@ resume_1: bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size; bs->dsk.meta_version = BLOCKSTORE_META_VERSION_V1; + printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n"); } else if (hdr->version > BLOCKSTORE_META_VERSION_V2) { @@ -829,7 +830,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data); } } - else + else if (je->small_write.len > 0) { uint32_t *block_csums = (uint32_t*)((uint8_t*)je + sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size); uint32_t start = je->small_write.offset / bs->dsk.csum_block_size; diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index 00765de6..d4367dd6 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -9,12 +9,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_ { if (!len) { - // Zero-length version - skip - return 1; - } - else if (IS_IN_FLIGHT(item_state)) - { - // Write not finished yet - skip + // Zero-length read return 1; } else if (IS_DELETE(item_state)) @@ -23,6 +18,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_ memset(buf, 0, len); return 1; } + assert(!IS_IN_FLIGHT(item_state)); if (journal.inmemory && IS_JOURNAL(item_state)) { memcpy(buf, (uint8_t*)journal.buffer + offset, len); @@ -97,18 +93,21 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint32_t cur_start = item_start; if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset) { + uint64_t blk_begin = 0, blk_end = 0; + uint8_t *blk_buf = NULL; + auto & rv = PRIV(read_op)->read_vec; cur_start = cur_start < read_op->offset ? read_op->offset : cur_start; item_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end; - find_holes(PRIV(read_op)->read_vec, cur_start, item_end, [&](int pos, bool alloc, uint32_t start, uint32_t end) + find_holes(rv, cur_start, item_end, [&](int pos, bool alloc, uint32_t start, uint32_t end) { - if (alloc) + if (!r || alloc) return 0; copy_buffer_t el = { .copy_flags = (IS_JOURNAL(item_state) ? COPY_BUF_JOURNAL : COPY_BUF_DATA), .offset = start, .len = end-start, .disk_offset = item_location + el.offset - item_start, - .journal_sector = journal_sector, + .journal_sector = (IS_JOURNAL(item_state) ? journal_sector : 0), .csum_buf = !csum ? NULL : (csum + (cur_start - item_start) / dsk.csum_block_size * (dsk.data_csum_type & 0xFF)), }; if (IS_BIG_WRITE(item_state)) @@ -118,17 +117,27 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, // Very improbable, but possible PRIV(read_op)->clean_version_used = 1; } - PRIV(read_op)->read_vec.insert(PRIV(read_op)->read_vec.begin() + pos, el); + rv.insert(rv.begin() + pos, el); + if (!journal.inmemory && dsk.csum_block_size > dsk.bitmap_granularity && IS_JOURNAL(item_state) && !IS_DELETE(item_state)) + { + int pad_state = pad_journal_read(rv, rv[pos], item_start, item_end, item_location, + csum, start, end-start, blk_begin, blk_end, blk_buf); + if (pad_state == 2) + return 1; + else if (pad_state == 1) + { + // Submit the journal checksum block read + if (!read_checksum_block(read_op, 1, fulfilled, item_location)) + r = 0; + return 1; + } + } + fulfilled += el.len; if (!fulfill_read_push(read_op, (uint8_t*)read_op->buf + el.offset - read_op->offset, item_location + el.offset - item_start, el.len, item_state, item_version)) - { - PRIV(read_op)->read_vec.clear(); r = 0; - return 0; - } - fulfilled += el.len; return 1; }); } @@ -240,10 +249,11 @@ bool blockstore_impl_t::read_range_fulfilled(std::vector & rv, ui return all_done; } -bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_pos, - uint64_t &fulfilled, uint64_t clean_loc, uint32_t item_start, uint32_t item_end) +bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc) { auto & rv = PRIV(op)->read_vec; + auto *vi = &rv[rv.size()-rv_pos]; + uint32_t item_start = vi->offset, item_end = vi->offset+vi->len; uint32_t fill_size = 0; int n_iov = 0; find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end) @@ -271,31 +281,32 @@ bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_po .copy_flags = COPY_BUF_DATA, .offset = cur_start, .len = cur_end-cur_start, - .disk_offset = clean_loc, }); fulfilled += cur_end-cur_start; return 1; } return 0; }); + vi = &rv[rv.size()-rv_pos]; // Save buf into read_vec too but in a creepy way // FIXME: Shit, something else should be invented %) - rv[rv.size()-rv_pos] = (copy_buffer_t){ - .copy_flags = COPY_BUF_CSUM_FILL, + *vi = (copy_buffer_t){ + .copy_flags = vi->copy_flags, .offset = 0xffffffff, .len = ((uint64_t)n_iov << 32) | fill_size, .disk_offset = clean_loc + item_start, - .csum_buf = (uint8_t*)buf, + .buf = (uint8_t*)buf, + .csum_buf = vi->csum_buf, }; + int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd); + uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset); uint32_t d_pos = 0; for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX) { int n_cur = n_iov-n_pos < IOV_MAX ? n_iov-n_pos : IOV_MAX; BS_SUBMIT_GET_SQE(sqe, data); PRIV(op)->pending_ops++; - my_uring_prep_readv( - sqe, dsk.data_fd, iov + n_pos, n_cur, dsk.data_offset + clean_loc + d_pos - ); + my_uring_prep_readv(sqe, submit_fd, iov + n_pos, n_cur, submit_offset + clean_loc + d_pos); data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); }; if (n_pos > 0 || n_pos + IOV_MAX < n_iov) { @@ -308,14 +319,17 @@ bool blockstore_impl_t::read_clean_checksum_block(blockstore_op_t *op, int rv_po else data->iov.iov_len = item_end-item_start; } - // Reading may race with flushing. - // - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta - // - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7 - // - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs - // - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata - // otherwise the checksum may not match - // So flushers save a copy of punched metadata if the object is being read during (6). - PRIV(op)->clean_version_used = 1; + if (!(vi->copy_flags & COPY_BUF_JOURNAL)) + { + // Reading may race with flushing. + // - Flushing happens in 3 steps: (2) punch holes in meta -> (4) update data -> (6) update meta + // - Reading may start/end at: 1/3, 1/5, 1/7, 3/5, 3/7, 5/7 + // - 1/3, 1/5, 3/5 are not a problem because we'll check data using punched bitmap and CRCs + // - For 1/7, 3/7 and 5/7 to finish correctly we need a copy of punched metadata + // otherwise the checksum may not match + // So flushers save a copy of punched metadata if the object is being read during (6). + PRIV(op)->clean_version_used = 1; + } return true; } @@ -341,6 +355,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) uint64_t fulfilled = 0; PRIV(read_op)->pending_ops = 0; PRIV(read_op)->clean_version_used = 0; + auto & rv = PRIV(read_op)->read_vec; uint64_t result_version = 0; if (dirty_found) { @@ -369,15 +384,23 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) } } // If inmemory_journal is false, journal trim will have to wait until the read is completed - // FIXME: Verify checksums when reading from journal disk - if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, - dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset), - (IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0), - journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size)) + if (!IS_JOURNAL(dirty.state)) { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; + // Read from data disk, possibly checking checksums + if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dirty.location, dirty_it->first.version)) + { + goto undo_read; + } + } + else + { + // Copy from memory or read from journal, possibly checking checksums + if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, + dirty.state, dirty_it->first.version, dirty.location, dirty.journal_sector+1, + journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size)) + { + goto undo_read; + } } } if (fulfilled == read_op->len || dirty_it == dirty_db.begin()) @@ -401,84 +424,9 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (fulfilled < read_op->len) { uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0); - if (!dsk.clean_entry_bitmap_size) + if (!fulfill_clean_read(read_op, fulfilled, clean_entry_bitmap, clean_it->second.location, clean_it->second.version)) { - if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, - (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0, - clean_entry_bitmap + 2*dsk.clean_entry_bitmap_size)) - { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; - } - } - else if (dsk.csum_block_size > dsk.bitmap_granularity) - { - auto & rv = PRIV(read_op)->read_vec; - int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, - (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len); - for (int i = req; i > 0; i--) - { - auto & vi = rv[rv.size()-i]; - if (!read_clean_checksum_block(read_op, i, fulfilled, clean_it->second.location, vi.offset, vi.offset+vi.len)) - { - // need to wait. undo added requests, don't dequeue op - for (auto & vec: rv) - { - if (vec.copy_flags == COPY_BUF_CSUM_FILL && vec.csum_buf) - { - free(vec.csum_buf); - vec.csum_buf = NULL; - } - } - rv.clear(); - return 0; - } - } - } - else - { - uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity; - while (bmp_start < bmp_size) - { - while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size) - { - bmp_end++; - } - if (bmp_end > bmp_start) - { - // fill with zeroes - assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity, - bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL)); - } - bmp_start = bmp_end; - while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size) - { - bmp_end++; - } - if (bmp_end > bmp_start) - { - uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + - 2*dsk.clean_entry_bitmap_size + - bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF)); - if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity, - bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, - clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0, csum)) - { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; - } - bmp_start = bmp_end; - } - } - } - // Increment counter if clean data is being read from the disk - if (PRIV(read_op)->clean_version_used) - { - obj_ver_id ov = { .oid = read_op->oid, .version = clean_it->second.version }; - used_clean_objects[ov].refs++; - PRIV(read_op)->clean_version_used = ov.version; + goto undo_read; } } } @@ -490,11 +438,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) FINISH_OP(read_op); return 2; } - if (fulfilled < read_op->len) - { - assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL)); - assert(fulfilled == read_op->len); - } + assert(fulfilled == read_op->len); read_op->version = result_version; if (!PRIV(read_op)->pending_ops) { @@ -519,6 +463,127 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) } read_op->retval = 0; return 2; +undo_read: + // need to wait. undo added requests, don't dequeue op + if (dsk.csum_block_size > dsk.bitmap_granularity) + { + for (auto & vec: rv) + { + if ((vec.copy_flags & COPY_BUF_CSUM_FILL) && vec.buf) + { + free(vec.buf); + vec.buf = NULL; + } + } + } + rv.clear(); + return 0; +} + +int blockstore_impl_t::pad_journal_read(std::vector & rv, copy_buffer_t & cp, + // FIXME Passing dirty_entry& would be nicer + uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, + uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf) +{ + if (offset % dsk.csum_block_size || submit_len % dsk.csum_block_size) + { + if (offset < blk_end) + { + // Already being read as a part of the previous checksum block series + cp.buf = blk_buf + offset - blk_begin; + cp.copy_flags |= COPY_BUF_COALESCED; + if (offset+submit_len > blk_end) + cp.len = blk_end-offset; + return 2; + } + else + { + // We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap) + blk_begin = (offset/dsk.csum_block_size) * dsk.csum_block_size; + blk_begin = blk_begin < dirty_offset ? dirty_offset : blk_begin; + blk_end = ((offset+submit_len-1)/dsk.csum_block_size + 1) * dsk.csum_block_size; + blk_end = blk_end > dirty_end ? dirty_end : blk_end; + if (blk_begin < offset || blk_end > offset+submit_len) + { + blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin); + cp.buf = blk_buf + offset - blk_begin; + cp.copy_flags |= COPY_BUF_COALESCED; + rv.push_back((copy_buffer_t){ + .copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL, + .offset = blk_begin, + .len = blk_end-blk_begin, + .disk_offset = dirty_loc + blk_begin - dirty_offset, + .buf = blk_buf, + .csum_buf = (csum_ptr + (blk_begin/dsk.csum_block_size - + dirty_offset/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)), + }); + return 1; + } + } + } + return 0; +} + +bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled, + uint8_t *clean_entry_bitmap, uint64_t clean_loc, uint64_t clean_ver) +{ + if (dsk.csum_block_size > dsk.bitmap_granularity) + { + auto & rv = PRIV(read_op)->read_vec; + int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, + (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len); + for (int i = req; i > 0; i--) + { + if (!read_checksum_block(read_op, i, fulfilled, clean_loc)) + { + return false; + } + } + PRIV(read_op)->clean_version_used = req > 0; + } + else + { + uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity; + while (bmp_start < bmp_size) + { + while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size) + { + bmp_end++; + } + if (bmp_end > bmp_start) + { + // fill with zeroes + assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity, + bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL)); + } + bmp_start = bmp_end; + while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size) + { + bmp_end++; + } + if (bmp_end > bmp_start) + { + uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + + 2*dsk.clean_entry_bitmap_size + + bmp_start*dsk.bitmap_granularity/dsk.csum_block_size*(dsk.data_csum_type & 0xFF)); + if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity, + bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, + clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum)) + { + return false; + } + bmp_start = bmp_end; + } + } + } + // Increment reference counter if clean data is being read from the disk + if (PRIV(read_op)->clean_version_used) + { + obj_ver_id ov = { .oid = read_op->oid, .version = clean_ver }; + used_clean_objects[ov].refs++; + PRIV(read_op)->clean_version_used = ov.version; + } + return true; } bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint32_t offset, @@ -646,13 +711,16 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op auto & rv = PRIV(op)->read_vec; if (dsk.csum_block_size > dsk.bitmap_granularity) { - for (int i = rv.size()-1; i >= 0 && rv[i].copy_flags == COPY_BUF_CSUM_FILL; i--) + for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--) { - struct iovec *iov = (struct iovec*)(rv[i].csum_buf + (rv[i].len & 0xFFFFFFFF)); - if (!verify_read_padded_checksums(op, rv[i].disk_offset, iov, rv[i].len >> 32)) + struct iovec *iov = (struct iovec*)(rv[i].buf + (rv[i].len & 0xFFFFFFFF)); + int n_iov = rv[i].len >> 32; + if (!((rv[i].copy_flags & COPY_BUF_JOURNAL) + ? verify_journal_checksums(rv[i].csum_buf, rv[i].disk_offset % dsk.data_block_size, iov, n_iov, NULL) + : verify_read_padded_checksums(op, rv[i].disk_offset, iov, n_iov))) op->retval = -EDOM; - free(rv[i].csum_buf); - rv[i].csum_buf = NULL; + free(rv[i].buf); + rv[i].buf = NULL; } } else diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 2327be72..1fc019db 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -194,9 +194,12 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) data_csums[0] = fn(0, op->buf, op->len, op->offset - start*dsk.csum_block_size, end*dsk.csum_block_size - (op->offset+op->len)); else { + // First block data_csums[0] = fn(0, op->buf, dsk.csum_block_size*(start+1)-op->offset, op->offset - start*dsk.csum_block_size, 0); + // Intermediate blocks for (uint32_t i = start+1; i < end; i++) data_csums[i-start] = crc32c(0, (uint8_t*)op->buf + dsk.csum_block_size*i-op->offset, dsk.csum_block_size); + // Last block data_csums[end-start] = fn( 0, (uint8_t*)op->buf + end*dsk.csum_block_size - op->offset, op->offset+op->len - end*dsk.csum_block_size, diff --git a/src/disk_tool_journal.cpp b/src/disk_tool_journal.cpp index 42c1cbda..d997b0b8 100644 --- a/src/disk_tool_journal.cpp +++ b/src/disk_tool_journal.cpp @@ -199,16 +199,44 @@ int disk_tool_t::process_journal_block(void *buf, std::functionsmall_write.len, dsk.journal_offset+je->small_write.data_offset) == je->small_write.len); data_crc32 = je_start.csum_block_size ? 0 : crc32c(0, small_write_data, je->small_write.len); data_csum_valid = (data_crc32 == je->small_write.crc32_data); - if (je_start.csum_block_size) + if (je_start.csum_block_size && je->small_write.len > 0) { - uint32_t data_csum_size = je->small_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF); - uint32_t *block_csums = (uint32_t*)((uint8_t*)je + je->size - data_csum_size); - for (uint32_t pos = 0; pos < je->small_write.len; pos += je_start.csum_block_size, block_csums++) + // like in enqueue_write() + uint32_t start = je->small_write.offset / je_start.csum_block_size; + uint32_t end = (je->small_write.offset+je->small_write.len-1) / je_start.csum_block_size; + uint32_t data_csum_size = (end-start+1) * (je_start.data_csum_type & 0xFF); + if (je->size < sizeof(journal_entry_small_write) + data_csum_size) { - if (crc32c(0, (uint8_t*)small_write_data + pos, je_start.csum_block_size) != *block_csums) + data_csum_valid = false; + } + else + { + uint32_t calc_csum = 0; + uint32_t *block_csums = (uint32_t*)((uint8_t*)je + je->size - data_csum_size); + if (start == end) { - data_csum_valid = false; - break; + calc_csum = crc32c(0, (uint8_t*)small_write_data, je->small_write.len); + data_csum_valid = data_csum_valid && (calc_csum == *block_csums++); + } + else + { + // First block + calc_csum = crc32c(0, (uint8_t*)small_write_data, + je_start.csum_block_size*(start+1)-je->small_write.offset); + data_csum_valid = data_csum_valid && (calc_csum == *block_csums++); + // Intermediate blocks + for (uint32_t i = start+1; i < end; i++) + { + calc_csum = crc32c(0, (uint8_t*)small_write_data + + je_start.csum_block_size*i-je->small_write.offset, je_start.csum_block_size); + data_csum_valid = data_csum_valid && (calc_csum == *block_csums++); + } + // Last block + calc_csum = crc32c( + 0, (uint8_t*)small_write_data + end*je_start.csum_block_size - je->small_write.offset, + je->small_write.offset+je->small_write.len - end*je_start.csum_block_size + ); + data_csum_valid = data_csum_valid && (calc_csum == *block_csums++); } } } @@ -265,20 +293,22 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json) } else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) { + auto & sw = je->small_write; printf( json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\"" : "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx", je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "", - je->small_write.oid.inode, je->small_write.oid.stripe, - je->small_write.version, je->small_write.offset, je->small_write.len, - je->small_write.data_offset + sw.oid.inode, sw.oid.stripe, sw.version, sw.offset, sw.len, sw.data_offset ); - if (journal_calc_data_pos != je->small_write.data_offset) + if (journal_calc_data_pos != sw.data_offset) { printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\"" : " (mismatched, calculated = %lu)", journal_pos); } - uint32_t data_csum_size = (!je_start.csum_block_size ? 0 : je->small_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF)); + uint32_t data_csum_size = (!je_start.csum_block_size + ? 0 + : ((sw.offset + sw.len - 1)/je_start.csum_block_size - sw.offset/je_start.csum_block_size + 1) + *(je_start.data_csum_type & 0xFF)); if (je->size > sizeof(journal_entry_small_write) + data_csum_size) { printf(json ? ",\"bitmap\":\"" : " (bitmap: "); @@ -291,13 +321,13 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json) if (dump_with_data) { printf(json ? ",\"data\":\"" : " (data: "); - for (int i = 0; i < je->small_write.len; i++) + for (int i = 0; i < sw.len; i++) { printf("%02x", ((uint8_t*)small_write_data)[i]); } printf(json ? "\"" : ")"); } - if (data_csum_size > 0) + if (data_csum_size > 0 && je->size >= sizeof(journal_entry_small_write) + data_csum_size) { printf(json ? ",\"block_csums\":\"" : " block_csums="); uint8_t *block_csums = (uint8_t*)je + je->size - data_csum_size; @@ -307,27 +337,29 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json) } else { - printf(json ? ",\"data_crc32\":\"%08x\"" : " data_crc32=%08x", je->small_write.crc32_data); + printf(json ? ",\"data_crc32\":\"%08x\"" : " data_crc32=%08x", sw.crc32_data); } printf( json ? ",\"data_valid\":%s}" : "%s\n", (data_csum_valid - ? (json ? "false" : " (invalid)") - : (json ? "true" : " (valid)")) + ? (json ? "true" : " (valid)") + : (json ? "false" : " (invalid)")) ); } else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT) { + auto & bw = je->big_write; printf( json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\"" : "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx", je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "", - je->big_write.oid.inode, je->big_write.oid.stripe, - je->big_write.version, je->big_write.offset, je->big_write.len, - je->big_write.location + bw.oid.inode, bw.oid.stripe, bw.version, bw.offset, bw.len, bw.location ); - uint32_t data_csum_size = (!je_start.csum_block_size ? 0 : je->big_write.len/je_start.csum_block_size*(je_start.data_csum_type & 0xFF)); - if (data_csum_size > 0) + uint32_t data_csum_size = (!je_start.csum_block_size + ? 0 + : ((bw.offset + bw.len - 1)/je_start.csum_block_size - bw.offset/je_start.csum_block_size + 1) + *(je_start.data_csum_type & 0xFF)); + if (data_csum_size > 0 && je->size >= sizeof(journal_entry_big_write) + data_csum_size) { printf(json ? ",\"block_csums\":\"" : " block_csums="); uint8_t *block_csums = (uint8_t*)je + je->size - data_csum_size; @@ -335,10 +367,10 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json) printf("%02x", block_csums[i]); printf(json ? "\"" : ""); } - if (je->big_write.size > sizeof(journal_entry_big_write) + data_csum_size) + if (bw.size > sizeof(journal_entry_big_write) + data_csum_size) { printf(json ? ",\"bitmap\":\"" : " (bitmap: "); - for (int i = sizeof(journal_entry_big_write); i < je->big_write.size - data_csum_size; i++) + for (int i = sizeof(journal_entry_big_write); i < bw.size - data_csum_size; i++) { printf("%02x", ((uint8_t*)je)[i]); } diff --git a/src/disk_tool_meta.cpp b/src/disk_tool_meta.cpp index 39e18587..efaef0b8 100644 --- a/src/disk_tool_meta.cpp +++ b/src/disk_tool_meta.cpp @@ -33,14 +33,14 @@ int disk_tool_t::process_meta(std::function { if (hdr->version == BLOCKSTORE_META_VERSION_V1) { - // Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps + // Vitastor 0.6-0.8 - static array of clean_disk_entry with bitmaps hdr->data_csum_type = 0; hdr->csum_block_size = 0; hdr->header_csum = 0; } else if (hdr->version == BLOCKSTORE_META_VERSION_V2) { - // Vitastor 0.8 - static array of clean_disk_entry with bitmaps and checksums + // Vitastor 0.9 - static array of clean_disk_entry with bitmaps and checksums if (hdr->data_csum_type != 0 && hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C) { @@ -169,7 +169,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr) else if (hdr->version == BLOCKSTORE_META_VERSION_V2) { printf( - "{\"version\":\"0.8\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u," + "{\"version\":\"0.9\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u," "\"data_csum_type\":%s,\"csum_block_size\":%u,\"entries\":[\n", hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity, csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size diff --git a/tests/test_heal.sh b/tests/test_heal.sh index 559afbcd..a9de5361 100755 --- a/tests/test_heal.sh +++ b/tests/test_heal.sh @@ -49,7 +49,7 @@ kill_osds() kill_osds & LD_PRELOAD="build/src/libfio_vitastor.so" \ - fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -blockalign=4k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \ -randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 qemu-img convert -S 4096 -p \ diff --git a/tests/test_heal_csum.sh b/tests/test_heal_csum.sh new file mode 100755 index 00000000..dbd95ecd --- /dev/null +++ b/tests/test_heal_csum.sh @@ -0,0 +1 @@ +OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh