Implement sparse block bitmap to avoid zero-fill

blocking-uring-test
Vitaliy Filippov 2020-01-12 02:11:09 +03:00
parent 4b05bde3a2
commit cf819eb442
9 changed files with 298 additions and 147 deletions
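
In short: instead of optionally zero-filling the whole data block when a partial write allocates it (the old "zerofill" option, removed below), every clean metadata entry now carries a small bitmap with one bit per BITMAP_GRANULARITY (4 KB) stripe of the block. Writes pad the data only up to the surrounding stripe boundaries and set the corresponding bits, and reads consult the bitmap to skip stripes that were never written. A minimal sketch of the mapping, assuming a 128 KB block size; the constant names mirror the diff, the helper itself is illustrative only:

#include <stdint.h>

#define BITMAP_GRANULARITY 4096
static const uint32_t block_size = 128*1024;                                          // assumed example value
static const uint32_t clean_entry_bitmap_size = block_size / BITMAP_GRANULARITY / 8;  // 4 bytes = 32 bits

// Bit i of the per-entry bitmap covers bytes [i*BITMAP_GRANULARITY, (i+1)*BITMAP_GRANULARITY)
// of the data block; a clear bit means "never written, reads as zeroes".
static inline bool stripe_is_written(const uint8_t *bitmap, uint64_t byte_offset)
{
    uint64_t bit = byte_offset / BITMAP_GRANULARITY;
    return bitmap[bit >> 3] & (1 << (bit & 7));
}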

View File

@@ -188,85 +188,13 @@ bool journal_flusher_co::loop()
#ifdef BLOCKSTORE_DEBUG
printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
dirty_it = dirty_end;
flusher->active_flushers++;
v.clear();
wait_count = 0;
copy_count = 0;
clean_loc = UINT64_MAX;
has_delete = false;
skip_copy = false;
while (1)
resume_1:
// Scan dirty versions of the object
if (!scan_dirty(1))
{
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
{
// First we submit all reads
offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len;
it = v.begin();
while (1)
{
for (; it != v.end(); it++)
if (it->offset >= offset)
break;
if (it == v.end() || it->offset > offset && it->len > 0)
{
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
copy_count++;
if (bs->journal.inmemory)
{
// Take it from memory
memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
}
else
{
// Read it from disk
await_sqe(1);
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
data->callback = simple_callback_r;
my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
);
wait_count++;
}
}
offset = it->offset+it->len;
if (it == v.end() || offset >= end_offset)
break;
}
}
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{
// There is an unflushed big write. Copy small writes in its position
clean_loc = dirty_it->second.location;
skip_copy = true;
}
else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
{
// There is an unflushed delete
has_delete = true;
skip_copy = true;
}
else if (!IS_STABLE(dirty_it->second.state))
{
char err[1024];
snprintf(
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
);
throw std::runtime_error(err);
}
if (dirty_it == bs->dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != cur.oid)
{
break;
}
wait_state += 1;
return false;
}
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
{
@@ -283,16 +211,13 @@ bool journal_flusher_co::loop()
return true;
}
// Find it in clean_db
{
auto clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
old_clean_ver = (clean_it != bs->clean_db.end() ? clean_it->second.version : 0);
}
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
if (clean_loc == UINT64_MAX)
{
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
{
// Object not present at all. This is a bug.
// Object not allocated. This is a bug.
char err[1024];
snprintf(
err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device",
@@ -301,10 +226,10 @@ bool journal_flusher_co::loop()
throw std::runtime_error(err);
}
else
{
clean_loc = old_clean_loc;
}
}
else
has_delete = false;
// We also need to submit metadata read(s): we do a read-modify-write cycle for every operation.
resume_2:
if (!modify_meta_read(clean_loc, meta_new, 2))
@@ -339,9 +264,24 @@ bool journal_flusher_co::loop()
meta_old.it->second.state = 1;
bs->ringloop->wakeup();
}
// Reads completed, submit writes
// Reads completed, submit writes and set bitmap bits
if (bs->clean_entry_bitmap_size)
{
new_clean_bitmap = (bs->inmemory_meta
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len);
}
}
for (it = v.begin(); it != v.end(); it++)
{
if (new_clean_bitmap)
{
bitmap_set(new_clean_bitmap, it->offset, it->len);
}
await_sqe(4);
data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = simple_callback_w;
@@ -374,7 +314,7 @@ bool journal_flusher_co::loop()
wait_state = 5;
return false;
}
((clean_disk_entry*)meta_old.buf)[meta_old.pos] = { 0 };
memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, META_BLOCK_SIZE };
data->callback = simple_callback_w;
@@ -383,12 +323,20 @@ bool journal_flusher_co::loop()
);
wait_count++;
}
((clean_disk_entry*)meta_new.buf)[meta_new.pos] = has_delete
? (clean_disk_entry){ 0 }
: (clean_disk_entry){
.oid = cur.oid,
.version = cur.version,
};
if (has_delete)
{
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
}
else
{
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
new_entry->oid = cur.oid;
new_entry->version = cur.version;
if (!bs->inmemory_meta)
{
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
}
}
await_sqe(6);
data->iov = (struct iovec){ meta_new.buf, META_BLOCK_SIZE };
data->callback = simple_callback_w;
@@ -484,15 +432,109 @@ bool journal_flusher_co::loop()
return true;
}
bool journal_flusher_co::scan_dirty(int wait_base)
{
if (wait_state == wait_base)
{
goto resume_0;
}
dirty_it = dirty_end;
v.clear();
wait_count = 0;
copy_count = 0;
clean_loc = UINT64_MAX;
has_delete = false;
skip_copy = false;
clean_init_bitmap = false;
while (1)
{
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
{
// First we submit all reads
offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len;
it = v.begin();
while (1)
{
for (; it != v.end(); it++)
if (it->offset >= offset)
break;
if (it == v.end() || it->offset > offset && it->len > 0)
{
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
copy_count++;
if (bs->journal.inmemory)
{
// Take it from memory
memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
}
else
{
// Read it from disk
await_sqe(0);
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
data->callback = simple_callback_r;
my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
);
wait_count++;
}
}
offset = it->offset+it->len;
if (it == v.end() || offset >= end_offset)
break;
}
}
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{
// There is an unflushed big write. Copy small writes in its position
clean_loc = dirty_it->second.location;
clean_init_bitmap = true;
clean_bitmap_offset = dirty_it->second.offset;
clean_bitmap_len = dirty_it->second.len;
skip_copy = true;
}
else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
{
// There is an unflushed delete
has_delete = true;
skip_copy = true;
}
else if (!IS_STABLE(dirty_it->second.state))
{
char err[1024];
snprintf(
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
);
throw std::runtime_error(err);
}
if (dirty_it == bs->dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != cur.oid)
{
break;
}
}
return true;
}
bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
{
if (wait_state == wait_base)
{
goto resume_0;
}
// If we don't keep all metadata in memory all the time, we must check whether the same sector is already buffered.
// Yet another option is to use LSM trees for metadata, but that complicates everything a lot,
// so I'll avoid it as long as I can.
wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE;
wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / sizeof(clean_disk_entry)));
wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / bs->clean_entry_size)) * META_BLOCK_SIZE;
wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / bs->clean_entry_size));
if (bs->inmemory_meta)
{
wr.buf = bs->metadata_buffer + wr.sector;
@@ -643,3 +685,35 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
}
return true;
}
void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
{
if (start == 0)
{
if (len == 32*BITMAP_GRANULARITY)
{
// Fast path: the write covers all 32 bits of the bitmap
*((uint32_t*)bitmap) = UINT32_MAX;
return;
}
else if (len == 64*BITMAP_GRANULARITY)
{
// Fast path: the write covers all 64 bits of the bitmap
*((uint64_t*)bitmap) = UINT64_MAX;
return;
}
}
unsigned bit_start = start / BITMAP_GRANULARITY;
unsigned bit_end = ((start + len) + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
while (bit_start < bit_end)
{
if (!(bit_start & 7) && bit_end >= bit_start+8)
{
// Whole aligned byte: set all 8 bits at once
((uint8_t*)bitmap)[bit_start / 8] = 0xFF;
bit_start += 8;
}
else
{
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
bit_start++;
}
}
}
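
As a sanity check of the bit arithmetic, here is a hypothetical, self-contained restatement of the slow path of bitmap_set() above; the names and values are illustrative and not part of the commit:

#include <stdint.h>
#include <stdio.h>

#define BITMAP_GRANULARITY 4096

static void bitmap_set_demo(uint8_t *bitmap, uint64_t start, uint64_t len)
{
    // Round the byte range [start, start+len) outwards to whole 4 KB stripes
    unsigned bit_start = start / BITMAP_GRANULARITY;
    unsigned bit_end = (start + len + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
    for (; bit_start < bit_end; bit_start++)
        bitmap[bit_start / 8] |= 1 << (bit_start % 8);
}

int main()
{
    uint8_t bmp[4] = { 0 };
    // A flushed small write covering bytes [8192, 24576) of a 128 KB block
    // marks stripes 2..5, i.e. bits 2..5 of the first bitmap byte:
    bitmap_set_demo(bmp, 8192, 16384);
    printf("%02x %02x %02x %02x\n", bmp[0], bmp[1], bmp[2], bmp[3]); // prints: 3c 00 00 00
    return 0;
}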

View File

@@ -37,21 +37,34 @@ class journal_flusher_co
int wait_state, wait_count;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
bool skip_copy, has_delete;
std::list<flusher_sync_t>::iterator cur_sync;
obj_ver_id cur;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_end;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete;
spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int copy_count;
uint64_t offset, end_offset, submit_offset, submit_len, clean_loc, old_clean_loc, old_clean_ver;
uint64_t clean_loc, old_clean_loc;
flusher_meta_write_t meta_old, meta_new;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
std::list<flusher_sync_t>::iterator cur_sync;
bool clean_init_bitmap;
uint64_t clean_bitmap_offset, clean_bitmap_len;
void *new_clean_bitmap;
// local: scan_dirty()
uint64_t offset, end_offset, submit_offset, submit_len;
friend class journal_flusher_t;
bool scan_dirty(int wait_base);
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
void update_clean_db();
bool fsync_batch(bool fsync_meta, int wait_base);
void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
public:
journal_flusher_co();
bool loop();

View File

@@ -57,6 +57,8 @@ blockstore_impl_t::~blockstore_impl_t()
close(journal.fd);
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmap)
free(clean_bitmap);
}
bool blockstore_impl_t::is_started()

View File

@@ -23,10 +23,19 @@
//#define BLOCKSTORE_DEBUG
#define DISK_ALIGNMENT 512
// Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one
#define MEM_ALIGNMENT 512
// FIXME: Make following constants configurable
// Required write alignment and journal/metadata/data areas' location alignment
#define DISK_ALIGNMENT 512
// Journal block size - minimum_io_size of the journal device is the best choice
#define JOURNAL_BLOCK_SIZE 512
// Metadata block size - minimum_io_size of the metadata device is the best choice
#define META_BLOCK_SIZE 512
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple
// of the write alignment.
#define BITMAP_GRANULARITY 4096
// States are not stored on disk. Instead, they're deduced from the journal
@@ -83,12 +92,13 @@
#include "blockstore_journal.h"
// 24 bytes per "clean" entry on disk with fixed metadata tables
// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
// FIXME: maybe add crc32's to metadata
struct __attribute__((__packed__)) clean_disk_entry
{
object_id oid;
uint64_t version;
uint8_t bitmap[];
};
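
Because of the flexible bitmap[] member, on-disk entries are no longer sizeof(clean_disk_entry) bytes apart: they are laid out clean_entry_size bytes apart, which is what the pointer-arithmetic changes in the flusher, handle_entries() and modify_meta_read() are about. A minimal illustration under assumed sizes; the demo struct below only stands in for clean_disk_entry:

#include <stdint.h>
#include <string.h>

struct __attribute__((__packed__)) demo_clean_disk_entry
{
    uint64_t inode, stripe;   // stands in for the 16-byte object_id oid
    uint64_t version;
    uint8_t bitmap[];         // clean_entry_bitmap_size bytes follow every entry
};

int main()
{
    const uint32_t clean_entry_bitmap_size = 4;                        // 128 KB block, 4 KB granularity
    const uint32_t clean_entry_size =
        sizeof(demo_clean_disk_entry) + clean_entry_bitmap_size;       // 24 + 4 = 28 bytes on disk
    uint8_t meta_block[512] = { 0 };
    // Entry #3 of this metadata block starts at byte 3*28, not 3*24:
    demo_clean_disk_entry *e = (demo_clean_disk_entry*)(meta_block + 3*clean_entry_size);
    e->version = 1;
    memset(e->bitmap, 0xff, clean_entry_bitmap_size);
    return 0;
}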
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
@@ -177,6 +187,7 @@ class blockstore_impl_t
// Another option is https://github.com/algorithm-ninja/cpp-btree
spp::sparse_hash_map<object_id, clean_entry> clean_db;
uint8_t *clean_bitmap = NULL;
std::map<obj_ver_id, dirty_entry> dirty_db;
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
@@ -186,6 +197,7 @@ class blockstore_impl_t
uint64_t block_count;
uint32_t block_order, block_size;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
int meta_fd;
int data_fd;
@@ -197,10 +209,6 @@ class blockstore_impl_t
// FIXME: separate flags for data, metadata and journal
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_fsync = false;
// It is safe to disable zero fill if drive is zeroed before formatting.
// For example, with TRIM and Deterministic Read Zeroes after TRIM.
// FIXME: OP_DELETE should trim/zero out the block.
bool zerofill_enabled = false;
bool inmemory_meta = false;
void *metadata_buffer = NULL;
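
Both the flusher and the read path locate the same per-object bitmap, and where it lives depends on inmemory_meta: inside the clean_disk_entry held in metadata_buffer, or in the separate clean_bitmap array indexed by block number. A hypothetical free-function restatement of that address arithmetic (no such helper exists in the commit; the parameter names mirror the members above):

#include <stdint.h>
#include <stddef.h>

#define META_BLOCK_SIZE 512

uint8_t *find_clean_entry_bitmap(
    bool inmemory_meta, uint8_t *metadata_buffer, uint8_t *clean_bitmap,
    uint64_t clean_loc, uint32_t block_order,
    uint32_t clean_entry_size, uint32_t clean_entry_bitmap_size)
{
    const size_t entry_header = 24;  // = sizeof(clean_disk_entry): 16-byte oid + 8-byte version
    uint64_t block_num = clean_loc >> block_order;
    if (inmemory_meta)
    {
        // The bitmap sits right after oid+version inside the in-memory metadata copy
        uint64_t sector = (block_num / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
        uint64_t pos = block_num % (META_BLOCK_SIZE / clean_entry_size);
        return metadata_buffer + sector + pos*clean_entry_size + entry_header;
    }
    // Otherwise bitmaps live in the compact clean_bitmap array, one slot per data block
    return clean_bitmap + block_num*clean_entry_bitmap_size;
}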

View File

@@ -65,12 +65,11 @@ int blockstore_init_meta::loop()
void *done_buf = bs->inmemory_meta
? (metadata_buffer + done_pos)
: (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
unsigned count = META_BLOCK_SIZE / sizeof(clean_disk_entry);
unsigned count = META_BLOCK_SIZE / bs->clean_entry_size;
for (int sector = 0; sector < done_len; sector += META_BLOCK_SIZE)
{
clean_disk_entry *entries = (clean_disk_entry*)(done_buf + sector);
// handle <count> entries
handle_entries(entries, count, bs->block_order);
handle_entries(done_buf + sector, count, bs->block_order);
done_cnt += count;
}
prev_done = 0;
@@ -91,14 +90,19 @@ int blockstore_init_meta::loop()
return 0;
}
void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order)
void blockstore_init_meta::handle_entries(void* entries, unsigned count, int block_order)
{
for (unsigned i = 0; i < count; i++)
{
if (entries[i].oid.inode > 0)
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
{
auto clean_it = bs->clean_db.find(entries[i].oid);
if (clean_it == bs->clean_db.end() || clean_it->second.version < entries[i].version)
memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
auto clean_it = bs->clean_db.find(entry->oid);
if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
{
if (clean_it != bs->clean_db.end())
{
@@ -110,18 +114,18 @@ void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsi
}
entries_loaded++;
#ifdef BLOCKSTORE_DEBUG
printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
bs->data_alloc->set(done_cnt+i, true);
bs->clean_db[entries[i].oid] = (struct clean_entry){
.version = entries[i].version,
bs->clean_db[entry->oid] = (struct clean_entry){
.version = entry->version,
.location = (done_cnt+i) << block_order,
};
}
else
{
#ifdef BLOCKSTORE_DEBUG
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
}
}

View File

@@ -11,7 +11,7 @@ class blockstore_init_meta
uint64_t entries_loaded = 0;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
void handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order);
void handle_entries(void *entries, unsigned count, int block_order);
void handle_event(ring_data_t *data);
public:
blockstore_init_meta(blockstore_impl_t *bs);

View File

@@ -10,10 +10,6 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
{
disable_fsync = true;
}
if (config["zerofill"] == "true" || config["zerofill"] == "1" || config["zerofill"] == "yes")
{
zerofill_enabled = true;
}
// data
data_len = data_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
@@ -48,8 +44,18 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
? journal.len : meta_offset-journal.offset;
}
// required metadata size
if (BITMAP_GRANULARITY % DISK_ALIGNMENT)
{
throw std::runtime_error("Sparse write tracking granularity must be a multiple of write alignment");
}
if (block_size % BITMAP_GRANULARITY)
{
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
}
clean_entry_bitmap_size = block_size / BITMAP_GRANULARITY / 8;
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
block_count = data_len / block_size;
meta_len = ((block_count - 1 + META_BLOCK_SIZE / sizeof(clean_disk_entry)) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE;
meta_len = ((block_count - 1 + META_BLOCK_SIZE / clean_entry_size) / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
if (meta_area < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
@@ -64,7 +70,13 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
{
metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for metadata");
throw std::runtime_error("Failed to allocate memory for the metadata");
}
else if (clean_entry_bitmap_size)
{
clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
}
// requested journal size
uint64_t journal_wanted = strtoull(config["journal_size"].c_str(), NULL, 10);
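
A worked example of the new sizing under assumed device parameters (1 GB of data, 128 KB blocks, and the 512-byte metadata block size and 4 KB granularity from the defines above); none of the concrete numbers come from the diff:

#include <stdint.h>
#include <stdio.h>

int main()
{
    const uint64_t META_BLOCK_SIZE = 512, BITMAP_GRANULARITY = 4096;
    const uint64_t block_size = 128*1024, data_len = 1024*1024*1024;            // assumed values
    uint64_t clean_entry_bitmap_size = block_size / BITMAP_GRANULARITY / 8;     // 4 bytes
    uint64_t clean_entry_size = 24 + clean_entry_bitmap_size;                   // 24 + 4 = 28 bytes
    uint64_t block_count = data_len / block_size;                               // 8192 blocks
    uint64_t entries_per_meta_block = META_BLOCK_SIZE / clean_entry_size;       // 18 (8 bytes wasted per block)
    uint64_t meta_len = ((block_count - 1 + entries_per_meta_block) / entries_per_meta_block)
        * META_BLOCK_SIZE;                                                      // 456 * 512 = 233472 bytes
    printf("%lu entries per metadata block, meta_len = %lu bytes\n", entries_per_meta_block, meta_len);
    // Metadata position of e.g. data block #100 (cf. modify_meta_read above):
    uint64_t sector = (100 / entries_per_meta_block) * META_BLOCK_SIZE;         // 5 * 512 = 2560
    uint64_t pos = 100 % entries_per_meta_block;                                // 10
    printf("block #100 -> sector %lu, pos %lu\n", sector, pos);
    return 0;
}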

View File

@@ -60,11 +60,11 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
};
it = PRIV(read_op)->read_vec.insert(it, el);
fulfilled += el.len;
if (!fulfill_read_push(read_op, read_op->buf + el.offset - read_op->offset, item_location + el.offset - item_start, el.len, item_state, item_version))
{
return 0;
}
fulfilled += el.len;
}
cur_start = it->offset + it->len;
if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
@@ -97,7 +97,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
PRIV(read_op)->pending_ops = 0;
if (dirty_found)
{
while (dirty_it->first.oid == read_op->oid)
while (dirty_it->first.oid == read_op->oid && fulfilled < read_op->len)
{
dirty_entry& dirty = dirty_it->second;
bool version_ok = read_op->version >= dirty_it->first.version;
@@ -124,13 +124,55 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
dirty_it--;
}
}
if (clean_it != clean_db.end())
if (clean_it != clean_db.end() && fulfilled < read_op->len)
{
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
if (!clean_entry_bitmap_size)
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else
{
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
uint64_t pos = (meta_loc % (META_BLOCK_SIZE / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
}
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/BITMAP_GRANULARITY;
while (bmp_start < bmp_size)
{
while (bmp_start < bmp_size && !(clean_entry_bitmap[bmp_start >> 3] & (1 << (bmp_start & 0x7))))
{
bmp_start++;
}
bmp_end = bmp_start;
while (bmp_end < bmp_size && (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * BITMAP_GRANULARITY,
(bmp_end - bmp_start) * BITMAP_GRANULARITY, ST_CURRENT, 0, clean_it->second.location + bmp_start * BITMAP_GRANULARITY))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
}
}
}
}
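
The scan above turns the per-entry bitmap into read requests: runs of clear bits are skipped (those stripes were never written and stay zeroed in the result buffer), and every contiguous run of set bits becomes one fulfill_read() against the data device. A self-contained sketch of that range enumeration with a hypothetical bitmap value:

#include <stdint.h>
#include <stdio.h>

#define BITMAP_GRANULARITY 4096

int main()
{
    // 32 stripes of a 128 KB block; stripes 2..5 and 16..31 were written at some point
    uint8_t bmp[4] = { 0x3c, 0x00, 0xff, 0xff };
    uint64_t bmp_size = 32, bmp_start = 0, bmp_end = 0;
    while (bmp_start < bmp_size)
    {
        while (bmp_start < bmp_size && !(bmp[bmp_start >> 3] & (1 << (bmp_start & 7))))
            bmp_start++;                  // skip unwritten stripes
        bmp_end = bmp_start;
        while (bmp_end < bmp_size && (bmp[bmp_end >> 3] & (1 << (bmp_end & 7))))
            bmp_end++;                    // extend over written stripes
        if (bmp_end > bmp_start)
        {
            // prints [8192, 24576) and [65536, 131072) for the bitmap above
            printf("read bytes [%lu, %lu) from the data device\n",
                bmp_start*BITMAP_GRANULARITY, bmp_end*BITMAP_GRANULARITY);
            bmp_start = bmp_end;
        }
    }
    return 0;
}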
if (!PRIV(read_op)->pending_ops)

View File

@@ -106,28 +106,24 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
printf("Allocate block %lu\n", loc);
#endif
data_alloc->set(loc, true);
uint64_t stripe_offset = (op->offset % BITMAP_GRANULARITY);
uint64_t stripe_end = (op->offset + op->len) % BITMAP_GRANULARITY;
// Zero fill up to BITMAP_GRANULARITY
int vcnt = 0;
uint64_t stripe_offset = 0;
if (op->len != block_size && zerofill_enabled)
if (stripe_offset)
{
// Zero fill newly allocated object if required
if (op->offset > 0)
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, op->offset };
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
if (op->offset+op->len < block_size)
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, block_size - (op->offset + op->len) };
data->iov.iov_len = block_size;
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
}
else
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
if (stripe_end)
{
vcnt = 1;
PRIV(op)->iov_zerofill[0] = (struct iovec){ op->buf, op->len };
data->iov.iov_len = op->len; // to check it in the callback
stripe_offset = op->offset;
stripe_end = BITMAP_GRANULARITY - stripe_end;
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
}
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
my_uring_prep_writev(
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + stripe_offset
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
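
For the write path just above: when a big write allocates a new block, the data is now padded with zeroes only out to the surrounding 4 KB stripe boundaries (using up to three iovecs) instead of out to the whole block. A worked example with assumed values, op->offset = 5000 and op->len = 3000:

#include <stdint.h>
#include <stdio.h>

#define BITMAP_GRANULARITY 4096

int main()
{
    uint64_t offset = 5000, len = 3000;                    // assumed partial write into a new block
    uint64_t stripe_offset = offset % BITMAP_GRANULARITY;  // 904 zero bytes prepended
    uint64_t stripe_end = (offset + len) % BITMAP_GRANULARITY;
    if (stripe_end)
        stripe_end = BITMAP_GRANULARITY - stripe_end;      // 192 zero bytes appended
    // The writev() then covers [offset - stripe_offset, offset + len + stripe_end),
    // i.e. exactly the 4 KB stripes touched by the write: bytes [4096, 8192) of the block.
    printf("write %lu bytes at block offset %lu (pad %lu before, %lu after)\n",
        stripe_offset + len + stripe_end, offset - stripe_offset, stripe_offset, stripe_end);
    return 0;
}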