From cf819eb442940173705fb7bc9d04767efb331054 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 12 Jan 2020 02:11:09 +0300 Subject: [PATCH] Implement sparse block bitmap to avoid zero-fill --- blockstore_flush.cpp | 264 +++++++++++++++++++++++++++---------------- blockstore_flush.h | 25 +++- blockstore_impl.cpp | 2 + blockstore_impl.h | 20 +++- blockstore_init.cpp | 26 +++-- blockstore_init.h | 2 +- blockstore_open.cpp | 24 +++- blockstore_read.cpp | 56 +++++++-- blockstore_write.cpp | 26 ++--- 9 files changed, 298 insertions(+), 147 deletions(-) diff --git a/blockstore_flush.cpp b/blockstore_flush.cpp index 0a1c72661..fffdaf4b8 100644 --- a/blockstore_flush.cpp +++ b/blockstore_flush.cpp @@ -188,85 +188,13 @@ bool journal_flusher_co::loop() #ifdef BLOCKSTORE_DEBUG printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version); #endif - dirty_it = dirty_end; flusher->active_flushers++; - v.clear(); - wait_count = 0; - copy_count = 0; - clean_loc = UINT64_MAX; - has_delete = false; - skip_copy = false; - while (1) +resume_1: + // Scan dirty versions of the object + if (!scan_dirty(1)) { - if (dirty_it->second.state == ST_J_STABLE && !skip_copy) - { - // First we submit all reads - offset = dirty_it->second.offset; - end_offset = dirty_it->second.offset + dirty_it->second.len; - it = v.begin(); - while (1) - { - for (; it != v.end(); it++) - if (it->offset >= offset) - break; - if (it == v.end() || it->offset > offset && it->len > 0) - { - submit_offset = dirty_it->second.location + offset - dirty_it->second.offset; - submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset; - it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) }); - copy_count++; - if (bs->journal.inmemory) - { - // Take it from memory - memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len); - } - else - { - // Read it from disk - await_sqe(1); - data->iov = (struct iovec){ v.back().buf, (size_t)submit_len }; - data->callback = simple_callback_r; - my_uring_prep_readv( - sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset - ); - wait_count++; - } - } - offset = it->offset+it->len; - if (it == v.end() || offset >= end_offset) - break; - } - } - else if (dirty_it->second.state == ST_D_STABLE && !skip_copy) - { - // There is an unflushed big write. Copy small writes in its position - clean_loc = dirty_it->second.location; - skip_copy = true; - } - else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy) - { - // There is an unflushed delete - has_delete = true; - skip_copy = true; - } - else if (!IS_STABLE(dirty_it->second.state)) - { - char err[1024]; - snprintf( - err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d", - dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state - ); - throw std::runtime_error(err); - } - if (dirty_it == bs->dirty_db.begin()) - { - break; - } - dirty_it--; - if (dirty_it->first.oid != cur.oid) - { - break; - } + wait_state += 1; + return false; } if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete) { @@ -283,16 +211,13 @@ bool journal_flusher_co::loop() return true; } // Find it in clean_db - { - auto clean_it = bs->clean_db.find(cur.oid); - old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX); - old_clean_ver = (clean_it != bs->clean_db.end() ? 
clean_it->second.version : 0); - } + clean_it = bs->clean_db.find(cur.oid); + old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX); if (clean_loc == UINT64_MAX) { if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX) { - // Object not present at all. This is a bug. + // Object not allocated. This is a bug. char err[1024]; snprintf( err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device", @@ -301,10 +226,10 @@ bool journal_flusher_co::loop() throw std::runtime_error(err); } else + { clean_loc = old_clean_loc; + } } - else - has_delete = false; // Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation. resume_2: if (!modify_meta_read(clean_loc, meta_new, 2)) @@ -339,9 +264,24 @@ bool journal_flusher_co::loop() meta_old.it->second.state = 1; bs->ringloop->wakeup(); } - // Reads completed, submit writes + // Reads completed, submit writes and set bitmap bits + if (bs->clean_entry_bitmap_size) + { + new_clean_bitmap = (bs->inmemory_meta + ? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry) + : bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size); + if (clean_init_bitmap) + { + memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size); + bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len); + } + } for (it = v.begin(); it != v.end(); it++) { + if (new_clean_bitmap) + { + bitmap_set(new_clean_bitmap, it->offset, it->len); + } await_sqe(4); data->iov = (struct iovec){ it->buf, (size_t)it->len }; data->callback = simple_callback_w; @@ -374,7 +314,7 @@ bool journal_flusher_co::loop() wait_state = 5; return false; } - ((clean_disk_entry*)meta_old.buf)[meta_old.pos] = { 0 }; + memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size); await_sqe(15); data->iov = (struct iovec){ meta_old.buf, META_BLOCK_SIZE }; data->callback = simple_callback_w; @@ -383,12 +323,20 @@ bool journal_flusher_co::loop() ); wait_count++; } - ((clean_disk_entry*)meta_new.buf)[meta_new.pos] = has_delete - ? 
(clean_disk_entry){ 0 } - : (clean_disk_entry){ - .oid = cur.oid, - .version = cur.version, - }; + if (has_delete) + { + memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size); + } + else + { + clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size); + new_entry->oid = cur.oid; + new_entry->version = cur.version; + if (!bs->inmemory_meta) + { + memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size); + } + } await_sqe(6); data->iov = (struct iovec){ meta_new.buf, META_BLOCK_SIZE }; data->callback = simple_callback_w; @@ -484,15 +432,109 @@ bool journal_flusher_co::loop() return true; } +bool journal_flusher_co::scan_dirty(int wait_base) +{ + if (wait_state == wait_base) + { + goto resume_0; + } + dirty_it = dirty_end; + v.clear(); + wait_count = 0; + copy_count = 0; + clean_loc = UINT64_MAX; + has_delete = false; + skip_copy = false; + clean_init_bitmap = false; + while (1) + { + if (dirty_it->second.state == ST_J_STABLE && !skip_copy) + { + // First we submit all reads + offset = dirty_it->second.offset; + end_offset = dirty_it->second.offset + dirty_it->second.len; + it = v.begin(); + while (1) + { + for (; it != v.end(); it++) + if (it->offset >= offset) + break; + if (it == v.end() || it->offset > offset && it->len > 0) + { + submit_offset = dirty_it->second.location + offset - dirty_it->second.offset; + submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset; + it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) }); + copy_count++; + if (bs->journal.inmemory) + { + // Take it from memory + memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len); + } + else + { + // Read it from disk + await_sqe(0); + data->iov = (struct iovec){ v.back().buf, (size_t)submit_len }; + data->callback = simple_callback_r; + my_uring_prep_readv( + sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset + ); + wait_count++; + } + } + offset = it->offset+it->len; + if (it == v.end() || offset >= end_offset) + break; + } + } + else if (dirty_it->second.state == ST_D_STABLE && !skip_copy) + { + // There is an unflushed big write. Copy small writes in its position + clean_loc = dirty_it->second.location; + clean_init_bitmap = true; + clean_bitmap_offset = dirty_it->second.offset; + clean_bitmap_len = dirty_it->second.len; + skip_copy = true; + } + else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy) + { + // There is an unflushed delete + has_delete = true; + skip_copy = true; + } + else if (!IS_STABLE(dirty_it->second.state)) + { + char err[1024]; + snprintf( + err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d", + dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state + ); + throw std::runtime_error(err); + } + if (dirty_it == bs->dirty_db.begin()) + { + break; + } + dirty_it--; + if (dirty_it->first.oid != cur.oid) + { + break; + } + } + return true; +} + bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base) { if (wait_state == wait_base) + { goto resume_0; + } // We must check if the same sector is already in memory if we don't keep all metadata in memory all the time. // And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot, // so I'll avoid it as long as I can. 
-    wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE;
-    wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / sizeof(clean_disk_entry)));
+    wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / bs->clean_entry_size)) * META_BLOCK_SIZE;
+    wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / bs->clean_entry_size));
     if (bs->inmemory_meta)
     {
         wr.buf = bs->metadata_buffer + wr.sector;
@@ -643,3 +685,35 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
     }
     return true;
 }
+
+void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
+{
+    if (start == 0)
+    {
+        if (len == 32*BITMAP_GRANULARITY)
+        {
+            *((uint32_t*)bitmap) = UINT32_MAX;
+            return;
+        }
+        else if (len == 64*BITMAP_GRANULARITY)
+        {
+            *((uint64_t*)bitmap) = UINT64_MAX;
+            return;
+        }
+    }
+    unsigned bit_start = start / BITMAP_GRANULARITY;
+    unsigned bit_end = ((start + len) + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
+    while (bit_start < bit_end)
+    {
+        if (!(bit_start & 7) && bit_end >= bit_start+8)
+        {
+            ((uint8_t*)bitmap)[bit_start / 8] = 0xFF;
+            bit_start += 8;
+        }
+        else
+        {
+            ((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
+            bit_start++;
+        }
+    }
+}
diff --git a/blockstore_flush.h b/blockstore_flush.h
index 7ac33ec94..dfa514ba6 100644
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -37,21 +37,34 @@ class journal_flusher_co
     int wait_state, wait_count;
     struct io_uring_sqe *sqe;
     struct ring_data_t *data;
-    bool skip_copy, has_delete;
+
+    std::list<flusher_sync_t>::iterator cur_sync;
+
     obj_ver_id cur;
-    std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
+    std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_end;
+    std::map<object_id, uint64_t>::iterator repeat_it;
+    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
+
+    bool skip_copy, has_delete;
+    spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
     std::vector<copy_buffer_t> v;
     std::vector<copy_buffer_t>::iterator it;
     int copy_count;
-    uint64_t offset, end_offset, submit_offset, submit_len, clean_loc, old_clean_loc, old_clean_ver;
+    uint64_t clean_loc, old_clean_loc;
     flusher_meta_write_t meta_old, meta_new;
-    std::map<object_id, uint64_t>::iterator repeat_it;
-    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
-    std::list<flusher_sync_t>::iterator cur_sync;
+    bool clean_init_bitmap;
+    uint64_t clean_bitmap_offset, clean_bitmap_len;
+    void *new_clean_bitmap;
+
+    // local: scan_dirty()
+    uint64_t offset, end_offset, submit_offset, submit_len;
+
     friend class journal_flusher_t;
+    bool scan_dirty(int wait_base);
     bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
     void update_clean_db();
     bool fsync_batch(bool fsync_meta, int wait_base);
+    void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
 public:
     journal_flusher_co();
     bool loop();
diff --git a/blockstore_impl.cpp b/blockstore_impl.cpp
index e1d994289..afa032ffd 100644
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -57,6 +57,8 @@ blockstore_impl_t::~blockstore_impl_t()
         close(journal.fd);
     if (metadata_buffer)
         free(metadata_buffer);
+    if (clean_bitmap)
+        free(clean_bitmap);
 }
 
 bool blockstore_impl_t::is_started()
diff --git a/blockstore_impl.h b/blockstore_impl.h
index ce9367c1c..a9072e17d 100644
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -23,10 +23,19 @@
 
 //#define BLOCKSTORE_DEBUG
 
-#define DISK_ALIGNMENT 512
+// Memory alignment for direct I/O (usually 512 bytes)
+// All other alignments must be a multiple of this one
 #define MEM_ALIGNMENT 512
+// FIXME: Make following constants configurable
+// Required write alignment and journal/metadata/data areas' location alignment
+#define DISK_ALIGNMENT 512
+// Journal block size - minimum_io_size of the journal device is the best choice
 #define JOURNAL_BLOCK_SIZE 512
+// Metadata block size - minimum_io_size of the metadata device is the best choice
 #define META_BLOCK_SIZE 512
+// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple
+// of the write alignment.
+#define BITMAP_GRANULARITY 4096
 
 // States are not stored on disk. Instead, they're deduced from the journal
 
@@ -83,12 +92,13 @@
 
 #include "blockstore_journal.h"
 
-// 24 bytes per "clean" entry on disk with fixed metadata tables
+// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
 // FIXME: maybe add crc32's to metadata
 struct __attribute__((__packed__)) clean_disk_entry
 {
     object_id oid;
     uint64_t version;
+    uint8_t bitmap[];
 };
 
 // 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
@@ -177,6 +187,7 @@ class blockstore_impl_t
     // Another option is https://github.com/algorithm-ninja/cpp-btree
     spp::sparse_hash_map<object_id, clean_entry> clean_db;
+    uint8_t *clean_bitmap = NULL;
     std::map<obj_ver_id, dirty_entry> dirty_db;
     std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
     std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
@@ -186,6 +197,7 @@
     uint64_t block_count;
     uint32_t block_order, block_size;
+    uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
     int meta_fd;
     int data_fd;
@@ -197,10 +209,6 @@
     // FIXME: separate flags for data, metadata and journal
     // It is safe to disable fsync() if drive write cache is writethrough
     bool disable_fsync = false;
-    // It is safe to disable zero fill if drive is zeroed before formatting.
-    // For example, with TRIM and Deterministic Read Zeroes after TRIM.
-    // FIXME: OP_DELETE should trim/zero out the block.
-    bool zerofill_enabled = false;
     bool inmemory_meta = false;
     void *metadata_buffer = NULL;
 
diff --git a/blockstore_init.cpp b/blockstore_init.cpp
index 059169740..a369b905d 100644
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -65,12 +65,11 @@ int blockstore_init_meta::loop()
         void *done_buf = bs->inmemory_meta ? (metadata_buffer + done_pos) : (metadata_buffer + (prev_done == 2 ?
bs->metadata_buf_size : 0)); - unsigned count = META_BLOCK_SIZE / sizeof(clean_disk_entry); + unsigned count = META_BLOCK_SIZE / bs->clean_entry_size; for (int sector = 0; sector < done_len; sector += META_BLOCK_SIZE) { - clean_disk_entry *entries = (clean_disk_entry*)(done_buf + sector); // handle entries - handle_entries(entries, count, bs->block_order); + handle_entries(done_buf + sector, count, bs->block_order); done_cnt += count; } prev_done = 0; @@ -91,14 +90,19 @@ int blockstore_init_meta::loop() return 0; } -void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order) +void blockstore_init_meta::handle_entries(void* entries, unsigned count, int block_order) { for (unsigned i = 0; i < count; i++) { - if (entries[i].oid.inode > 0) + clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size); + if (!bs->inmemory_meta && bs->clean_entry_bitmap_size) { - auto clean_it = bs->clean_db.find(entries[i].oid); - if (clean_it == bs->clean_db.end() || clean_it->second.version < entries[i].version) + memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size); + } + if (entry->oid.inode > 0) + { + auto clean_it = bs->clean_db.find(entry->oid); + if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version) { if (clean_it != bs->clean_db.end()) { @@ -110,18 +114,18 @@ void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsi } entries_loaded++; #ifdef BLOCKSTORE_DEBUG - printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version); + printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version); #endif bs->data_alloc->set(done_cnt+i, true); - bs->clean_db[entries[i].oid] = (struct clean_entry){ - .version = entries[i].version, + bs->clean_db[entry->oid] = (struct clean_entry){ + .version = entry->version, .location = (done_cnt+i) << block_order, }; } else { #ifdef BLOCKSTORE_DEBUG - printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version); + printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version); #endif } } diff --git a/blockstore_init.h b/blockstore_init.h index e83c81a26..d6c880191 100644 --- a/blockstore_init.h +++ b/blockstore_init.h @@ -11,7 +11,7 @@ class blockstore_init_meta uint64_t entries_loaded = 0; struct io_uring_sqe *sqe; struct ring_data_t *data; - void handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order); + void handle_entries(void *entries, unsigned count, int block_order); void handle_event(ring_data_t *data); public: blockstore_init_meta(blockstore_impl_t *bs); diff --git a/blockstore_open.cpp b/blockstore_open.cpp index 4741ec8fc..893ebdfa0 100644 --- a/blockstore_open.cpp +++ b/blockstore_open.cpp @@ -10,10 +10,6 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config) { disable_fsync = true; } - if (config["zerofill"] == "true" || config["zerofill"] == "1" || config["zerofill"] == "yes") - { - zerofill_enabled = true; - } // data data_len = data_size - data_offset; if (data_fd == meta_fd && data_offset < meta_offset) @@ -48,8 +44,18 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config) ? 
journal.len : meta_offset-journal.offset; } // required metadata size + if (BITMAP_GRANULARITY % DISK_ALIGNMENT) + { + throw std::runtime_error("Sparse write tracking granularity must be a multiple of write alignment"); + } + if (block_size % BITMAP_GRANULARITY) + { + throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity"); + } + clean_entry_bitmap_size = block_size / BITMAP_GRANULARITY / 8; + clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size; block_count = data_len / block_size; - meta_len = ((block_count - 1 + META_BLOCK_SIZE / sizeof(clean_disk_entry)) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE; + meta_len = ((block_count - 1 + META_BLOCK_SIZE / clean_entry_size) / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE; if (meta_area < meta_len) { throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes"); @@ -64,7 +70,13 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config) { metadata_buffer = memalign(MEM_ALIGNMENT, meta_len); if (!metadata_buffer) - throw std::runtime_error("Failed to allocate memory for metadata"); + throw std::runtime_error("Failed to allocate memory for the metadata"); + } + else if (clean_entry_bitmap_size) + { + clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size); + if (!clean_bitmap) + throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap"); } // requested journal size uint64_t journal_wanted = strtoull(config["journal_size"].c_str(), NULL, 10); diff --git a/blockstore_read.cpp b/blockstore_read.cpp index d53c6c384..e749b422b 100644 --- a/blockstore_read.cpp +++ b/blockstore_read.cpp @@ -60,11 +60,11 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille .len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start, }; it = PRIV(read_op)->read_vec.insert(it, el); - fulfilled += el.len; if (!fulfill_read_push(read_op, read_op->buf + el.offset - read_op->offset, item_location + el.offset - item_start, el.len, item_state, item_version)) { return 0; } + fulfilled += el.len; } cur_start = it->offset + it->len; if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end) @@ -97,7 +97,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) PRIV(read_op)->pending_ops = 0; if (dirty_found) { - while (dirty_it->first.oid == read_op->oid) + while (dirty_it->first.oid == read_op->oid && fulfilled < read_op->len) { dirty_entry& dirty = dirty_it->second; bool version_ok = read_op->version >= dirty_it->first.version; @@ -124,13 +124,55 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) dirty_it--; } } - if (clean_it != clean_db.end()) + if (clean_it != clean_db.end() && fulfilled < read_op->len) { - if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location)) + if (!clean_entry_bitmap_size) { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; + if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location)) + { + // need to wait. 
undo added requests, don't dequeue op
+                PRIV(read_op)->read_vec.clear();
+                return 0;
+            }
+        }
+        else
+        {
+            uint64_t meta_loc = clean_it->second.location >> block_order;
+            uint8_t *clean_entry_bitmap;
+            if (inmemory_meta)
+            {
+                uint64_t sector = (meta_loc / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
+                uint64_t pos = (meta_loc % (META_BLOCK_SIZE / clean_entry_size));
+                clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+            }
+            else
+            {
+                clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
+            }
+            uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/BITMAP_GRANULARITY;
+            while (bmp_start < bmp_size)
+            {
+                while (bmp_start < bmp_size && !(clean_entry_bitmap[bmp_start >> 3] & (1 << (bmp_start & 0x7))))
+                {
+                    bmp_start++;
+                }
+                bmp_end = bmp_start;
+                while (bmp_end < bmp_size && (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
+                {
+                    bmp_end++;
+                }
+                if (bmp_end > bmp_start)
+                {
+                    if (!fulfill_read(read_op, fulfilled, bmp_start * BITMAP_GRANULARITY,
+                        (bmp_end - bmp_start) * BITMAP_GRANULARITY, ST_CURRENT, 0, clean_it->second.location + bmp_start * BITMAP_GRANULARITY))
+                    {
+                        // need to wait. undo added requests, don't dequeue op
+                        PRIV(read_op)->read_vec.clear();
+                        return 0;
+                    }
+                    bmp_start = bmp_end;
+                }
+            }
+        }
     }
     if (!PRIV(read_op)->pending_ops)
diff --git a/blockstore_write.cpp b/blockstore_write.cpp
index c14e85a66..2acdfcf19 100644
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -106,28 +106,24 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
     printf("Allocate block %lu\n", loc);
 #endif
     data_alloc->set(loc, true);
+    uint64_t stripe_offset = (op->offset % BITMAP_GRANULARITY);
+    uint64_t stripe_end = (op->offset + op->len) % BITMAP_GRANULARITY;
+    // Zero fill up to BITMAP_GRANULARITY
     int vcnt = 0;
-    uint64_t stripe_offset = 0;
-    if (op->len != block_size && zerofill_enabled)
+    if (stripe_offset)
     {
-        // Zero fill newly allocated object if required
-        if (op->offset > 0)
-            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, op->offset };
-        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
-        if (op->offset+op->len < block_size)
-            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, block_size - (op->offset + op->len) };
-        data->iov.iov_len = block_size;
+        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
     }
-    else
+    PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
+    if (stripe_end)
     {
-        vcnt = 1;
-        PRIV(op)->iov_zerofill[0] = (struct iovec){ op->buf, op->len };
-        data->iov.iov_len = op->len; // to check it in the callback
-        stripe_offset = op->offset;
+        stripe_end = BITMAP_GRANULARITY - stripe_end;
+        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
     }
+    data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
     data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
     my_uring_prep_writev(
-        sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + stripe_offset
+        sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
     );
     PRIV(op)->pending_ops = 1;
     PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
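
A minimal standalone sketch of the bitmap logic introduced by this patch (illustration only, not part of the patch itself): it assumes BITMAP_GRANULARITY = 4096 as defined above and a hypothetical 128 KiB block size, so each clean entry carries a 4-byte bitmap with one bit per 4 KiB granule. The hypothetical bitmap_set() mirrors the flusher helper, and written_ranges() mirrors the read path that only reads granules whose bits are set.

// Illustration only, not part of the patch. Assumes BITMAP_GRANULARITY = 4096
// and a hypothetical 128 KiB block size (one bit per 4 KiB granule, 4 bytes total).
#include <stdint.h>
#include <stdio.h>
#include <utility>
#include <vector>

#define BITMAP_GRANULARITY 4096
#define BLOCK_SIZE (128*1024)
#define BITMAP_BYTES (BLOCK_SIZE / BITMAP_GRANULARITY / 8) // 32 bits -> 4 bytes

// Mark every granule touched by [start, start+len) as written
static void bitmap_set(uint8_t *bitmap, uint64_t start, uint64_t len)
{
    uint64_t bit = start / BITMAP_GRANULARITY;
    uint64_t bit_end = (start + len + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
    for (; bit < bit_end; bit++)
        bitmap[bit / 8] |= 1 << (bit % 8);
}

// Collect contiguous written ranges, as the read path does before submitting reads
static std::vector<std::pair<uint64_t, uint64_t>> written_ranges(const uint8_t *bitmap)
{
    std::vector<std::pair<uint64_t, uint64_t>> res;
    uint64_t bmp_size = BLOCK_SIZE / BITMAP_GRANULARITY;
    uint64_t bmp_start = 0;
    while (bmp_start < bmp_size)
    {
        // Skip unwritten granules
        while (bmp_start < bmp_size && !(bitmap[bmp_start >> 3] & (1 << (bmp_start & 7))))
            bmp_start++;
        // Extend over the following written granules
        uint64_t bmp_end = bmp_start;
        while (bmp_end < bmp_size && (bitmap[bmp_end >> 3] & (1 << (bmp_end & 7))))
            bmp_end++;
        if (bmp_end > bmp_start)
            res.push_back({ bmp_start*BITMAP_GRANULARITY, (bmp_end-bmp_start)*BITMAP_GRANULARITY });
        bmp_start = bmp_end;
    }
    return res;
}

int main()
{
    uint8_t bitmap[BITMAP_BYTES] = { 0 };
    // A small write at offset 6144 of length 8192 is padded to granule boundaries,
    // so it covers granules 1..3 (bytes 4096..16383)
    bitmap_set(bitmap, 6144, 8192);
    for (auto & r : written_ranges(bitmap))
        printf("written range: offset %lu, length %lu\n", (unsigned long)r.first, (unsigned long)r.second);
    return 0;
}

With these assumed defaults the whole per-object bitmap fits in 32 bits, which is why the flusher's fast path above can mark a fully written block with a single 32-bit store.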