forked from vitalif/vitastor
Implement sparse block bitmap to avoid zero-fill
parent
4b05bde3a2
commit
cf819eb442
|
@ -188,85 +188,13 @@ bool journal_flusher_co::loop()
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
#endif
|
#endif
|
||||||
dirty_it = dirty_end;
|
|
||||||
flusher->active_flushers++;
|
flusher->active_flushers++;
|
||||||
v.clear();
|
resume_1:
|
||||||
wait_count = 0;
|
// Scan dirty versions of the object
|
||||||
copy_count = 0;
|
if (!scan_dirty(1))
|
||||||
clean_loc = UINT64_MAX;
|
|
||||||
has_delete = false;
|
|
||||||
skip_copy = false;
|
|
||||||
while (1)
|
|
||||||
{
|
{
|
||||||
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
|
wait_state += 1;
|
||||||
{
|
return false;
|
||||||
// First we submit all reads
|
|
||||||
offset = dirty_it->second.offset;
|
|
||||||
end_offset = dirty_it->second.offset + dirty_it->second.len;
|
|
||||||
it = v.begin();
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
for (; it != v.end(); it++)
|
|
||||||
if (it->offset >= offset)
|
|
||||||
break;
|
|
||||||
if (it == v.end() || it->offset > offset && it->len > 0)
|
|
||||||
{
|
|
||||||
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
|
|
||||||
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
|
|
||||||
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
|
|
||||||
copy_count++;
|
|
||||||
if (bs->journal.inmemory)
|
|
||||||
{
|
|
||||||
// Take it from memory
|
|
||||||
memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Read it from disk
|
|
||||||
await_sqe(1);
|
|
||||||
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
|
|
||||||
data->callback = simple_callback_r;
|
|
||||||
my_uring_prep_readv(
|
|
||||||
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
|
|
||||||
);
|
|
||||||
wait_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
offset = it->offset+it->len;
|
|
||||||
if (it == v.end() || offset >= end_offset)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
|
|
||||||
{
|
|
||||||
// There is an unflushed big write. Copy small writes in its position
|
|
||||||
clean_loc = dirty_it->second.location;
|
|
||||||
skip_copy = true;
|
|
||||||
}
|
|
||||||
else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
|
|
||||||
{
|
|
||||||
// There is an unflushed delete
|
|
||||||
has_delete = true;
|
|
||||||
skip_copy = true;
|
|
||||||
}
|
|
||||||
else if (!IS_STABLE(dirty_it->second.state))
|
|
||||||
{
|
|
||||||
char err[1024];
|
|
||||||
snprintf(
|
|
||||||
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
|
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
|
||||||
);
|
|
||||||
throw std::runtime_error(err);
|
|
||||||
}
|
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
dirty_it--;
|
|
||||||
if (dirty_it->first.oid != cur.oid)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
|
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
|
||||||
{
|
{
|
||||||
|
@ -283,16 +211,13 @@ bool journal_flusher_co::loop()
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// Find it in clean_db
|
// Find it in clean_db
|
||||||
{
|
clean_it = bs->clean_db.find(cur.oid);
|
||||||
auto clean_it = bs->clean_db.find(cur.oid);
|
|
||||||
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||||
old_clean_ver = (clean_it != bs->clean_db.end() ? clean_it->second.version : 0);
|
|
||||||
}
|
|
||||||
if (clean_loc == UINT64_MAX)
|
if (clean_loc == UINT64_MAX)
|
||||||
{
|
{
|
||||||
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
|
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
|
||||||
{
|
{
|
||||||
// Object not present at all. This is a bug.
|
// Object not allocated. This is a bug.
|
||||||
char err[1024];
|
char err[1024];
|
||||||
snprintf(
|
snprintf(
|
||||||
err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device",
|
err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device",
|
||||||
|
@ -301,10 +226,10 @@ bool journal_flusher_co::loop()
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
clean_loc = old_clean_loc;
|
clean_loc = old_clean_loc;
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
has_delete = false;
|
|
||||||
// Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation.
|
// Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation.
|
||||||
resume_2:
|
resume_2:
|
||||||
if (!modify_meta_read(clean_loc, meta_new, 2))
|
if (!modify_meta_read(clean_loc, meta_new, 2))
|
||||||
|
@ -339,9 +264,24 @@ bool journal_flusher_co::loop()
|
||||||
meta_old.it->second.state = 1;
|
meta_old.it->second.state = 1;
|
||||||
bs->ringloop->wakeup();
|
bs->ringloop->wakeup();
|
||||||
}
|
}
|
||||||
// Reads completed, submit writes
|
// Reads completed, submit writes and set bitmap bits
|
||||||
|
if (bs->clean_entry_bitmap_size)
|
||||||
|
{
|
||||||
|
new_clean_bitmap = (bs->inmemory_meta
|
||||||
|
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
|
||||||
|
: bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
|
||||||
|
if (clean_init_bitmap)
|
||||||
|
{
|
||||||
|
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
|
||||||
|
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len);
|
||||||
|
}
|
||||||
|
}
|
||||||
for (it = v.begin(); it != v.end(); it++)
|
for (it = v.begin(); it != v.end(); it++)
|
||||||
{
|
{
|
||||||
|
if (new_clean_bitmap)
|
||||||
|
{
|
||||||
|
bitmap_set(new_clean_bitmap, it->offset, it->len);
|
||||||
|
}
|
||||||
await_sqe(4);
|
await_sqe(4);
|
||||||
data->iov = (struct iovec){ it->buf, (size_t)it->len };
|
data->iov = (struct iovec){ it->buf, (size_t)it->len };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
|
@ -374,7 +314,7 @@ bool journal_flusher_co::loop()
|
||||||
wait_state = 5;
|
wait_state = 5;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
((clean_disk_entry*)meta_old.buf)[meta_old.pos] = { 0 };
|
memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
|
||||||
await_sqe(15);
|
await_sqe(15);
|
||||||
data->iov = (struct iovec){ meta_old.buf, META_BLOCK_SIZE };
|
data->iov = (struct iovec){ meta_old.buf, META_BLOCK_SIZE };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
|
@ -383,12 +323,20 @@ bool journal_flusher_co::loop()
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
}
|
}
|
||||||
((clean_disk_entry*)meta_new.buf)[meta_new.pos] = has_delete
|
if (has_delete)
|
||||||
? (clean_disk_entry){ 0 }
|
{
|
||||||
: (clean_disk_entry){
|
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
|
||||||
.oid = cur.oid,
|
}
|
||||||
.version = cur.version,
|
else
|
||||||
};
|
{
|
||||||
|
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
|
||||||
|
new_entry->oid = cur.oid;
|
||||||
|
new_entry->version = cur.version;
|
||||||
|
if (!bs->inmemory_meta)
|
||||||
|
{
|
||||||
|
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
await_sqe(6);
|
await_sqe(6);
|
||||||
data->iov = (struct iovec){ meta_new.buf, META_BLOCK_SIZE };
|
data->iov = (struct iovec){ meta_new.buf, META_BLOCK_SIZE };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
|
@ -484,15 +432,109 @@ bool journal_flusher_co::loop()
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool journal_flusher_co::scan_dirty(int wait_base)
|
||||||
|
{
|
||||||
|
if (wait_state == wait_base)
|
||||||
|
{
|
||||||
|
goto resume_0;
|
||||||
|
}
|
||||||
|
dirty_it = dirty_end;
|
||||||
|
v.clear();
|
||||||
|
wait_count = 0;
|
||||||
|
copy_count = 0;
|
||||||
|
clean_loc = UINT64_MAX;
|
||||||
|
has_delete = false;
|
||||||
|
skip_copy = false;
|
||||||
|
clean_init_bitmap = false;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
|
||||||
|
{
|
||||||
|
// First we submit all reads
|
||||||
|
offset = dirty_it->second.offset;
|
||||||
|
end_offset = dirty_it->second.offset + dirty_it->second.len;
|
||||||
|
it = v.begin();
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
for (; it != v.end(); it++)
|
||||||
|
if (it->offset >= offset)
|
||||||
|
break;
|
||||||
|
if (it == v.end() || it->offset > offset && it->len > 0)
|
||||||
|
{
|
||||||
|
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
|
||||||
|
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
|
||||||
|
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
|
||||||
|
copy_count++;
|
||||||
|
if (bs->journal.inmemory)
|
||||||
|
{
|
||||||
|
// Take it from memory
|
||||||
|
memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Read it from disk
|
||||||
|
await_sqe(0);
|
||||||
|
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
|
||||||
|
data->callback = simple_callback_r;
|
||||||
|
my_uring_prep_readv(
|
||||||
|
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
|
||||||
|
);
|
||||||
|
wait_count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
offset = it->offset+it->len;
|
||||||
|
if (it == v.end() || offset >= end_offset)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
|
||||||
|
{
|
||||||
|
// There is an unflushed big write. Copy small writes in its position
|
||||||
|
clean_loc = dirty_it->second.location;
|
||||||
|
clean_init_bitmap = true;
|
||||||
|
clean_bitmap_offset = dirty_it->second.offset;
|
||||||
|
clean_bitmap_len = dirty_it->second.len;
|
||||||
|
skip_copy = true;
|
||||||
|
}
|
||||||
|
else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
|
||||||
|
{
|
||||||
|
// There is an unflushed delete
|
||||||
|
has_delete = true;
|
||||||
|
skip_copy = true;
|
||||||
|
}
|
||||||
|
else if (!IS_STABLE(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
char err[1024];
|
||||||
|
snprintf(
|
||||||
|
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
|
||||||
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||||
|
);
|
||||||
|
throw std::runtime_error(err);
|
||||||
|
}
|
||||||
|
if (dirty_it == bs->dirty_db.begin())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != cur.oid)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
|
bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
|
||||||
{
|
{
|
||||||
if (wait_state == wait_base)
|
if (wait_state == wait_base)
|
||||||
|
{
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
|
}
|
||||||
// We must check if the same sector is already in memory if we don't keep all metadata in memory all the time.
|
// We must check if the same sector is already in memory if we don't keep all metadata in memory all the time.
|
||||||
// And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
|
// And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
|
||||||
// so I'll avoid it as long as I can.
|
// so I'll avoid it as long as I can.
|
||||||
wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE;
|
wr.sector = ((meta_loc >> bs->block_order) / (META_BLOCK_SIZE / bs->clean_entry_size)) * META_BLOCK_SIZE;
|
||||||
wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / sizeof(clean_disk_entry)));
|
wr.pos = ((meta_loc >> bs->block_order) % (META_BLOCK_SIZE / bs->clean_entry_size));
|
||||||
if (bs->inmemory_meta)
|
if (bs->inmemory_meta)
|
||||||
{
|
{
|
||||||
wr.buf = bs->metadata_buffer + wr.sector;
|
wr.buf = bs->metadata_buffer + wr.sector;
|
||||||
|
@ -643,3 +685,35 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
|
||||||
|
{
|
||||||
|
if (start == 0)
|
||||||
|
{
|
||||||
|
if (len == 32*BITMAP_GRANULARITY)
|
||||||
|
{
|
||||||
|
*((uint32_t*)bitmap) = 1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (len == 64*BITMAP_GRANULARITY)
|
||||||
|
{
|
||||||
|
*((uint64_t*)bitmap) = 1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unsigned bit_start = start / BITMAP_GRANULARITY;
|
||||||
|
unsigned bit_end = ((start + len) + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
|
||||||
|
while (bit_start < bit_end)
|
||||||
|
{
|
||||||
|
if (!(bit_start & 7) && bit_end >= bit_start+8)
|
||||||
|
{
|
||||||
|
((uint8_t*)bitmap)[bit_start / 8] = 1;
|
||||||
|
bit_start += 8;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
|
||||||
|
bit_start++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -37,21 +37,34 @@ class journal_flusher_co
|
||||||
int wait_state, wait_count;
|
int wait_state, wait_count;
|
||||||
struct io_uring_sqe *sqe;
|
struct io_uring_sqe *sqe;
|
||||||
struct ring_data_t *data;
|
struct ring_data_t *data;
|
||||||
bool skip_copy, has_delete;
|
|
||||||
|
std::list<flusher_sync_t>::iterator cur_sync;
|
||||||
|
|
||||||
obj_ver_id cur;
|
obj_ver_id cur;
|
||||||
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_end;
|
||||||
|
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||||
|
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||||
|
|
||||||
|
bool skip_copy, has_delete;
|
||||||
|
spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
|
||||||
std::vector<copy_buffer_t> v;
|
std::vector<copy_buffer_t> v;
|
||||||
std::vector<copy_buffer_t>::iterator it;
|
std::vector<copy_buffer_t>::iterator it;
|
||||||
int copy_count;
|
int copy_count;
|
||||||
uint64_t offset, end_offset, submit_offset, submit_len, clean_loc, old_clean_loc, old_clean_ver;
|
uint64_t clean_loc, old_clean_loc;
|
||||||
flusher_meta_write_t meta_old, meta_new;
|
flusher_meta_write_t meta_old, meta_new;
|
||||||
std::map<object_id, uint64_t>::iterator repeat_it;
|
bool clean_init_bitmap;
|
||||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
||||||
std::list<flusher_sync_t>::iterator cur_sync;
|
void *new_clean_bitmap;
|
||||||
|
|
||||||
|
// local: scan_dirty()
|
||||||
|
uint64_t offset, end_offset, submit_offset, submit_len;
|
||||||
|
|
||||||
friend class journal_flusher_t;
|
friend class journal_flusher_t;
|
||||||
|
bool scan_dirty(int wait_base);
|
||||||
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
||||||
void update_clean_db();
|
void update_clean_db();
|
||||||
bool fsync_batch(bool fsync_meta, int wait_base);
|
bool fsync_batch(bool fsync_meta, int wait_base);
|
||||||
|
void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
|
||||||
public:
|
public:
|
||||||
journal_flusher_co();
|
journal_flusher_co();
|
||||||
bool loop();
|
bool loop();
|
||||||
|
|
|
@ -57,6 +57,8 @@ blockstore_impl_t::~blockstore_impl_t()
|
||||||
close(journal.fd);
|
close(journal.fd);
|
||||||
if (metadata_buffer)
|
if (metadata_buffer)
|
||||||
free(metadata_buffer);
|
free(metadata_buffer);
|
||||||
|
if (clean_bitmap)
|
||||||
|
free(clean_bitmap);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool blockstore_impl_t::is_started()
|
bool blockstore_impl_t::is_started()
|
||||||
|
|
|
@ -23,10 +23,19 @@
|
||||||
|
|
||||||
//#define BLOCKSTORE_DEBUG
|
//#define BLOCKSTORE_DEBUG
|
||||||
|
|
||||||
#define DISK_ALIGNMENT 512
|
// Memory alignment for direct I/O (usually 512 bytes)
|
||||||
|
// All other alignments must be a multiple of this one
|
||||||
#define MEM_ALIGNMENT 512
|
#define MEM_ALIGNMENT 512
|
||||||
|
// FIXME: Make following constants configurable
|
||||||
|
// Required write alignment and journal/metadata/data areas' location alignment
|
||||||
|
#define DISK_ALIGNMENT 512
|
||||||
|
// Journal block size - minimum_io_size of the journal device is the best choice
|
||||||
#define JOURNAL_BLOCK_SIZE 512
|
#define JOURNAL_BLOCK_SIZE 512
|
||||||
|
// Metadata block size - minimum_io_size of the metadata device is the best choice
|
||||||
#define META_BLOCK_SIZE 512
|
#define META_BLOCK_SIZE 512
|
||||||
|
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple
|
||||||
|
// of the write alignment.
|
||||||
|
#define BITMAP_GRANULARITY 4096
|
||||||
|
|
||||||
// States are not stored on disk. Instead, they're deduced from the journal
|
// States are not stored on disk. Instead, they're deduced from the journal
|
||||||
|
|
||||||
|
@ -83,12 +92,13 @@
|
||||||
|
|
||||||
#include "blockstore_journal.h"
|
#include "blockstore_journal.h"
|
||||||
|
|
||||||
// 24 bytes per "clean" entry on disk with fixed metadata tables
|
// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
|
||||||
// FIXME: maybe add crc32's to metadata
|
// FIXME: maybe add crc32's to metadata
|
||||||
struct __attribute__((__packed__)) clean_disk_entry
|
struct __attribute__((__packed__)) clean_disk_entry
|
||||||
{
|
{
|
||||||
object_id oid;
|
object_id oid;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
|
uint8_t bitmap[];
|
||||||
};
|
};
|
||||||
|
|
||||||
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
|
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
|
||||||
|
@ -177,6 +187,7 @@ class blockstore_impl_t
|
||||||
|
|
||||||
// Another option is https://github.com/algorithm-ninja/cpp-btree
|
// Another option is https://github.com/algorithm-ninja/cpp-btree
|
||||||
spp::sparse_hash_map<object_id, clean_entry> clean_db;
|
spp::sparse_hash_map<object_id, clean_entry> clean_db;
|
||||||
|
uint8_t *clean_bitmap = NULL;
|
||||||
std::map<obj_ver_id, dirty_entry> dirty_db;
|
std::map<obj_ver_id, dirty_entry> dirty_db;
|
||||||
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
|
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
|
||||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||||
|
@ -186,6 +197,7 @@ class blockstore_impl_t
|
||||||
|
|
||||||
uint64_t block_count;
|
uint64_t block_count;
|
||||||
uint32_t block_order, block_size;
|
uint32_t block_order, block_size;
|
||||||
|
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
|
||||||
|
|
||||||
int meta_fd;
|
int meta_fd;
|
||||||
int data_fd;
|
int data_fd;
|
||||||
|
@ -197,10 +209,6 @@ class blockstore_impl_t
|
||||||
// FIXME: separate flags for data, metadata and journal
|
// FIXME: separate flags for data, metadata and journal
|
||||||
// It is safe to disable fsync() if drive write cache is writethrough
|
// It is safe to disable fsync() if drive write cache is writethrough
|
||||||
bool disable_fsync = false;
|
bool disable_fsync = false;
|
||||||
// It is safe to disable zero fill if drive is zeroed before formatting.
|
|
||||||
// For example, with TRIM and Deterministic Read Zeroes after TRIM.
|
|
||||||
// FIXME: OP_DELETE should trim/zero out the block.
|
|
||||||
bool zerofill_enabled = false;
|
|
||||||
bool inmemory_meta = false;
|
bool inmemory_meta = false;
|
||||||
void *metadata_buffer = NULL;
|
void *metadata_buffer = NULL;
|
||||||
|
|
||||||
|
|
|
@ -65,12 +65,11 @@ int blockstore_init_meta::loop()
|
||||||
void *done_buf = bs->inmemory_meta
|
void *done_buf = bs->inmemory_meta
|
||||||
? (metadata_buffer + done_pos)
|
? (metadata_buffer + done_pos)
|
||||||
: (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
|
: (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
|
||||||
unsigned count = META_BLOCK_SIZE / sizeof(clean_disk_entry);
|
unsigned count = META_BLOCK_SIZE / bs->clean_entry_size;
|
||||||
for (int sector = 0; sector < done_len; sector += META_BLOCK_SIZE)
|
for (int sector = 0; sector < done_len; sector += META_BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
clean_disk_entry *entries = (clean_disk_entry*)(done_buf + sector);
|
|
||||||
// handle <count> entries
|
// handle <count> entries
|
||||||
handle_entries(entries, count, bs->block_order);
|
handle_entries(done_buf + sector, count, bs->block_order);
|
||||||
done_cnt += count;
|
done_cnt += count;
|
||||||
}
|
}
|
||||||
prev_done = 0;
|
prev_done = 0;
|
||||||
|
@ -91,14 +90,19 @@ int blockstore_init_meta::loop()
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order)
|
void blockstore_init_meta::handle_entries(void* entries, unsigned count, int block_order)
|
||||||
{
|
{
|
||||||
for (unsigned i = 0; i < count; i++)
|
for (unsigned i = 0; i < count; i++)
|
||||||
{
|
{
|
||||||
if (entries[i].oid.inode > 0)
|
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
|
||||||
|
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
|
||||||
{
|
{
|
||||||
auto clean_it = bs->clean_db.find(entries[i].oid);
|
memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
|
||||||
if (clean_it == bs->clean_db.end() || clean_it->second.version < entries[i].version)
|
}
|
||||||
|
if (entry->oid.inode > 0)
|
||||||
|
{
|
||||||
|
auto clean_it = bs->clean_db.find(entry->oid);
|
||||||
|
if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
|
||||||
{
|
{
|
||||||
if (clean_it != bs->clean_db.end())
|
if (clean_it != bs->clean_db.end())
|
||||||
{
|
{
|
||||||
|
@ -110,18 +114,18 @@ void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsi
|
||||||
}
|
}
|
||||||
entries_loaded++;
|
entries_loaded++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||||
#endif
|
#endif
|
||||||
bs->data_alloc->set(done_cnt+i, true);
|
bs->data_alloc->set(done_cnt+i, true);
|
||||||
bs->clean_db[entries[i].oid] = (struct clean_entry){
|
bs->clean_db[entry->oid] = (struct clean_entry){
|
||||||
.version = entries[i].version,
|
.version = entry->version,
|
||||||
.location = (done_cnt+i) << block_order,
|
.location = (done_cnt+i) << block_order,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ class blockstore_init_meta
|
||||||
uint64_t entries_loaded = 0;
|
uint64_t entries_loaded = 0;
|
||||||
struct io_uring_sqe *sqe;
|
struct io_uring_sqe *sqe;
|
||||||
struct ring_data_t *data;
|
struct ring_data_t *data;
|
||||||
void handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order);
|
void handle_entries(void *entries, unsigned count, int block_order);
|
||||||
void handle_event(ring_data_t *data);
|
void handle_event(ring_data_t *data);
|
||||||
public:
|
public:
|
||||||
blockstore_init_meta(blockstore_impl_t *bs);
|
blockstore_init_meta(blockstore_impl_t *bs);
|
||||||
|
|
|
@ -10,10 +10,6 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
|
||||||
{
|
{
|
||||||
disable_fsync = true;
|
disable_fsync = true;
|
||||||
}
|
}
|
||||||
if (config["zerofill"] == "true" || config["zerofill"] == "1" || config["zerofill"] == "yes")
|
|
||||||
{
|
|
||||||
zerofill_enabled = true;
|
|
||||||
}
|
|
||||||
// data
|
// data
|
||||||
data_len = data_size - data_offset;
|
data_len = data_size - data_offset;
|
||||||
if (data_fd == meta_fd && data_offset < meta_offset)
|
if (data_fd == meta_fd && data_offset < meta_offset)
|
||||||
|
@ -48,8 +44,18 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
|
||||||
? journal.len : meta_offset-journal.offset;
|
? journal.len : meta_offset-journal.offset;
|
||||||
}
|
}
|
||||||
// required metadata size
|
// required metadata size
|
||||||
|
if (BITMAP_GRANULARITY % DISK_ALIGNMENT)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Sparse write tracking granularity must be a multiple of write alignment");
|
||||||
|
}
|
||||||
|
if (block_size % BITMAP_GRANULARITY)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
|
||||||
|
}
|
||||||
|
clean_entry_bitmap_size = block_size / BITMAP_GRANULARITY / 8;
|
||||||
|
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
|
||||||
block_count = data_len / block_size;
|
block_count = data_len / block_size;
|
||||||
meta_len = ((block_count - 1 + META_BLOCK_SIZE / sizeof(clean_disk_entry)) / (META_BLOCK_SIZE / sizeof(clean_disk_entry))) * META_BLOCK_SIZE;
|
meta_len = ((block_count - 1 + META_BLOCK_SIZE / clean_entry_size) / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
|
||||||
if (meta_area < meta_len)
|
if (meta_area < meta_len)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
|
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
|
||||||
|
@ -64,7 +70,13 @@ void blockstore_impl_t::calc_lengths(blockstore_config_t & config)
|
||||||
{
|
{
|
||||||
metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
|
metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
|
||||||
if (!metadata_buffer)
|
if (!metadata_buffer)
|
||||||
throw std::runtime_error("Failed to allocate memory for metadata");
|
throw std::runtime_error("Failed to allocate memory for the metadata");
|
||||||
|
}
|
||||||
|
else if (clean_entry_bitmap_size)
|
||||||
|
{
|
||||||
|
clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
|
||||||
|
if (!clean_bitmap)
|
||||||
|
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
|
||||||
}
|
}
|
||||||
// requested journal size
|
// requested journal size
|
||||||
uint64_t journal_wanted = strtoull(config["journal_size"].c_str(), NULL, 10);
|
uint64_t journal_wanted = strtoull(config["journal_size"].c_str(), NULL, 10);
|
||||||
|
|
|
@ -60,11 +60,11 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
|
||||||
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
|
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
|
||||||
};
|
};
|
||||||
it = PRIV(read_op)->read_vec.insert(it, el);
|
it = PRIV(read_op)->read_vec.insert(it, el);
|
||||||
fulfilled += el.len;
|
|
||||||
if (!fulfill_read_push(read_op, read_op->buf + el.offset - read_op->offset, item_location + el.offset - item_start, el.len, item_state, item_version))
|
if (!fulfill_read_push(read_op, read_op->buf + el.offset - read_op->offset, item_location + el.offset - item_start, el.len, item_state, item_version))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
fulfilled += el.len;
|
||||||
}
|
}
|
||||||
cur_start = it->offset + it->len;
|
cur_start = it->offset + it->len;
|
||||||
if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
|
if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
|
||||||
|
@ -97,7 +97,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
PRIV(read_op)->pending_ops = 0;
|
PRIV(read_op)->pending_ops = 0;
|
||||||
if (dirty_found)
|
if (dirty_found)
|
||||||
{
|
{
|
||||||
while (dirty_it->first.oid == read_op->oid)
|
while (dirty_it->first.oid == read_op->oid && fulfilled < read_op->len)
|
||||||
{
|
{
|
||||||
dirty_entry& dirty = dirty_it->second;
|
dirty_entry& dirty = dirty_it->second;
|
||||||
bool version_ok = read_op->version >= dirty_it->first.version;
|
bool version_ok = read_op->version >= dirty_it->first.version;
|
||||||
|
@ -124,7 +124,9 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
dirty_it--;
|
dirty_it--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (clean_it != clean_db.end())
|
if (clean_it != clean_db.end() && fulfilled < read_op->len)
|
||||||
|
{
|
||||||
|
if (!clean_entry_bitmap_size)
|
||||||
{
|
{
|
||||||
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
|
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
|
||||||
{
|
{
|
||||||
|
@ -133,6 +135,46 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
uint64_t meta_loc = clean_it->second.location >> block_order;
|
||||||
|
uint8_t *clean_entry_bitmap;
|
||||||
|
if (inmemory_meta)
|
||||||
|
{
|
||||||
|
uint64_t sector = (meta_loc / (META_BLOCK_SIZE / clean_entry_size)) * META_BLOCK_SIZE;
|
||||||
|
uint64_t pos = (meta_loc % (META_BLOCK_SIZE / clean_entry_size));
|
||||||
|
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/BITMAP_GRANULARITY;
|
||||||
|
while (bmp_start < bmp_size)
|
||||||
|
{
|
||||||
|
while (!(clean_entry_bitmap[bmp_start >> 3] & (1 << (bmp_start & 0x7))) && bmp_start < bmp_size)
|
||||||
|
{
|
||||||
|
bmp_start++;
|
||||||
|
}
|
||||||
|
bmp_end = bmp_start;
|
||||||
|
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
|
||||||
|
{
|
||||||
|
bmp_end++;
|
||||||
|
}
|
||||||
|
if (bmp_end > bmp_start)
|
||||||
|
{
|
||||||
|
if (!fulfill_read(read_op, fulfilled, bmp_start * BITMAP_GRANULARITY,
|
||||||
|
(bmp_end - bmp_start) * BITMAP_GRANULARITY, ST_CURRENT, 0, clean_it->second.location + bmp_start * BITMAP_GRANULARITY))
|
||||||
|
{
|
||||||
|
// need to wait. undo added requests, don't dequeue op
|
||||||
|
PRIV(read_op)->read_vec.clear();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
bmp_start = bmp_end;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!PRIV(read_op)->pending_ops)
|
if (!PRIV(read_op)->pending_ops)
|
||||||
{
|
{
|
||||||
// everything is fulfilled from memory
|
// everything is fulfilled from memory
|
||||||
|
|
|
@ -106,28 +106,24 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
printf("Allocate block %lu\n", loc);
|
printf("Allocate block %lu\n", loc);
|
||||||
#endif
|
#endif
|
||||||
data_alloc->set(loc, true);
|
data_alloc->set(loc, true);
|
||||||
|
uint64_t stripe_offset = (op->offset % BITMAP_GRANULARITY);
|
||||||
|
uint64_t stripe_end = (op->offset + op->len) % BITMAP_GRANULARITY;
|
||||||
|
// Zero fill up to BITMAP_GRANULARITY
|
||||||
int vcnt = 0;
|
int vcnt = 0;
|
||||||
uint64_t stripe_offset = 0;
|
if (stripe_offset)
|
||||||
if (op->len != block_size && zerofill_enabled)
|
|
||||||
{
|
{
|
||||||
// Zero fill newly allocated object if required
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
|
||||||
if (op->offset > 0)
|
}
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, op->offset };
|
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
|
||||||
if (op->offset+op->len < block_size)
|
if (stripe_end)
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, block_size - (op->offset + op->len) };
|
|
||||||
data->iov.iov_len = block_size;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
vcnt = 1;
|
stripe_end = BITMAP_GRANULARITY - stripe_end;
|
||||||
PRIV(op)->iov_zerofill[0] = (struct iovec){ op->buf, op->len };
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
|
||||||
data->iov.iov_len = op->len; // to check it in the callback
|
|
||||||
stripe_offset = op->offset;
|
|
||||||
}
|
}
|
||||||
|
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
|
||||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + stripe_offset
|
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
|
||||||
);
|
);
|
||||||
PRIV(op)->pending_ops = 1;
|
PRIV(op)->pending_ops = 1;
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||||
|
|
Loading…
Reference in New Issue