Fix linear overwrite, make metadata writes ordered, ignore older entries when recovering journal
parent
b6fff5a77e
commit
45f34fb3b2
|
@ -71,10 +71,6 @@ void allocator::set(uint64_t addr, bool value)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
mask[last] = mask[last] & ~(1l << bit);
|
mask[last] = mask[last] & ~(1l << bit);
|
||||||
if (mask[last] != 0)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
is_last = false;
|
is_last = false;
|
||||||
if (p2 > 1)
|
if (p2 > 1)
|
||||||
|
|
|
@ -340,11 +340,11 @@ resume_0:
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
}
|
}
|
||||||
// And a metadata write
|
|
||||||
resume_5:
|
resume_5:
|
||||||
if (meta_it->second.state == 0)
|
// And a metadata write, but only after data writes complete
|
||||||
|
if (meta_it->second.state == 0 || wait_count > 0)
|
||||||
{
|
{
|
||||||
// metadata sector is still being read, wait for it
|
// metadata sector is still being read or data is still being written, wait for it
|
||||||
wait_state = 5;
|
wait_state = 5;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -352,8 +352,6 @@ resume_0:
|
||||||
.oid = cur.oid,
|
.oid = cur.oid,
|
||||||
.version = cur.version,
|
.version = cur.version,
|
||||||
};
|
};
|
||||||
// I consider unordered writes to data & metadata safe here
|
|
||||||
// BUT it requires that journal entries even older than clean_db are replayed after restart
|
|
||||||
await_sqe(6);
|
await_sqe(6);
|
||||||
data->iov = (struct iovec){ meta_it->second.buf, 512 };
|
data->iov = (struct iovec){ meta_it->second.buf, 512 };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
|
@ -432,6 +430,9 @@ resume_0:
|
||||||
// Update clean_db and dirty_db, free old data locations
|
// Update clean_db and dirty_db, free old data locations
|
||||||
if (old_clean_loc != clean_loc)
|
if (old_clean_loc != clean_loc)
|
||||||
{
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", old_clean_loc >> bs->block_order);
|
||||||
|
#endif
|
||||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||||
}
|
}
|
||||||
bs->clean_db[cur.oid] = {
|
bs->clean_db[cur.oid] = {
|
||||||
|
@ -443,6 +444,9 @@ resume_0:
|
||||||
{
|
{
|
||||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
||||||
{
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", dirty_it->second.location >> bs->block_order);
|
||||||
|
#endif
|
||||||
bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
|
bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
|
||||||
}
|
}
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
|
|
@ -85,23 +85,30 @@ void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, int
|
||||||
if (entries[i].oid.inode > 0)
|
if (entries[i].oid.inode > 0)
|
||||||
{
|
{
|
||||||
auto clean_it = bs->clean_db.find(entries[i].oid);
|
auto clean_it = bs->clean_db.find(entries[i].oid);
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Clean entry %u: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
|
||||||
#endif
|
|
||||||
if (clean_it == bs->clean_db.end() || clean_it->second.version < entries[i].version)
|
if (clean_it == bs->clean_db.end() || clean_it->second.version < entries[i].version)
|
||||||
{
|
{
|
||||||
if (clean_it != bs->clean_db.end())
|
if (clean_it != bs->clean_db.end())
|
||||||
{
|
{
|
||||||
// free the previous block
|
// free the previous block
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", clean_it->second.location >> bs->block_order);
|
||||||
|
#endif
|
||||||
bs->data_alloc->set(clean_it->second.location >> block_order, false);
|
bs->data_alloc->set(clean_it->second.location >> block_order, false);
|
||||||
}
|
}
|
||||||
entries_loaded++;
|
entries_loaded++;
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
||||||
|
#endif
|
||||||
bs->data_alloc->set(done_cnt+i, true);
|
bs->data_alloc->set(done_cnt+i, true);
|
||||||
bs->clean_db[entries[i].oid] = (struct clean_entry){
|
bs->clean_db[entries[i].oid] = (struct clean_entry){
|
||||||
.version = entries[i].version,
|
.version = entries[i].version,
|
||||||
.location = (done_cnt+i) << block_order,
|
.location = (done_cnt+i) << block_order,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
else
|
||||||
|
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -286,6 +293,7 @@ resume_1:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// FIXME Trim journal on start so we don't stall when all entries are older
|
||||||
printf("Journal entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
|
printf("Journal entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
|
||||||
if (!bs->journal.inmemory)
|
if (!bs->journal.inmemory)
|
||||||
{
|
{
|
||||||
|
@ -356,6 +364,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t len)
|
||||||
snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
|
snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
}
|
}
|
||||||
|
auto clean_it = bs->clean_db.find(je->small_write.oid);
|
||||||
|
if (clean_it == bs->clean_db.end() ||
|
||||||
|
clean_it->second.version < je->big_write.version)
|
||||||
|
{
|
||||||
obj_ver_id ov = {
|
obj_ver_id ov = {
|
||||||
.oid = je->small_write.oid,
|
.oid = je->small_write.oid,
|
||||||
.version = je->small_write.version,
|
.version = je->small_write.version,
|
||||||
|
@ -378,7 +390,12 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t len)
|
||||||
auto & unstab = bs->unstable_writes[ov.oid];
|
auto & unstab = bs->unstable_writes[ov.oid];
|
||||||
unstab = unstab < ov.version ? ov.version : unstab;
|
unstab = unstab < ov.version ? ov.version : unstab;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else if (je->type == JE_BIG_WRITE)
|
else if (je->type == JE_BIG_WRITE)
|
||||||
|
{
|
||||||
|
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||||
|
if (clean_it == bs->clean_db.end() ||
|
||||||
|
clean_it->second.version < je->big_write.version)
|
||||||
{
|
{
|
||||||
// oid, version, block
|
// oid, version, block
|
||||||
obj_ver_id ov = {
|
obj_ver_id ov = {
|
||||||
|
@ -396,11 +413,15 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t len)
|
||||||
.len = bs->block_size,
|
.len = bs->block_size,
|
||||||
.journal_sector = proc_pos,
|
.journal_sector = proc_pos,
|
||||||
});
|
});
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
|
||||||
|
#endif
|
||||||
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
|
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
|
||||||
bs->journal.used_sectors[proc_pos]++;
|
bs->journal.used_sectors[proc_pos]++;
|
||||||
auto & unstab = bs->unstable_writes[ov.oid];
|
auto & unstab = bs->unstable_writes[ov.oid];
|
||||||
unstab = unstab < ov.version ? ov.version : unstab;
|
unstab = unstab < ov.version ? ov.version : unstab;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else if (je->type == JE_STABLE)
|
else if (je->type == JE_STABLE)
|
||||||
{
|
{
|
||||||
// oid, version
|
// oid, version
|
||||||
|
|
|
@ -6,7 +6,8 @@ class blockstore_init_meta
|
||||||
int wait_state = 0, wait_count = 0;
|
int wait_state = 0, wait_count = 0;
|
||||||
uint8_t *metadata_buffer = NULL;
|
uint8_t *metadata_buffer = NULL;
|
||||||
uint64_t metadata_read = 0;
|
uint64_t metadata_read = 0;
|
||||||
int prev = 0, prev_done = 0, done_len = 0, submitted = 0, done_cnt = 0;
|
int prev = 0, prev_done = 0, done_len = 0, submitted = 0;
|
||||||
|
uint64_t done_cnt = 0;
|
||||||
uint64_t entries_loaded = 0;
|
uint64_t entries_loaded = 0;
|
||||||
struct io_uring_sqe *sqe;
|
struct io_uring_sqe *sqe;
|
||||||
struct ring_data_t *data;
|
struct ring_data_t *data;
|
||||||
|
|
|
@ -73,6 +73,9 @@ int blockstore::dequeue_write(blockstore_operation *op)
|
||||||
BS_SUBMIT_GET_SQE(sqe, data);
|
BS_SUBMIT_GET_SQE(sqe, data);
|
||||||
dirty_it->second.location = loc << block_order;
|
dirty_it->second.location = loc << block_order;
|
||||||
dirty_it->second.state = ST_D_SUBMITTED;
|
dirty_it->second.state = ST_D_SUBMITTED;
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Allocate block %lu\n", loc);
|
||||||
|
#endif
|
||||||
data_alloc->set(loc, true);
|
data_alloc->set(loc, true);
|
||||||
int vcnt = 0;
|
int vcnt = 0;
|
||||||
if (op->version == 1 && op->len != block_size)
|
if (op->version == 1 && op->len != block_size)
|
||||||
|
|
|
@ -1,6 +1,19 @@
|
||||||
// FIO engine to test Blockstore
|
// FIO engine to test Blockstore
|
||||||
|
//
|
||||||
|
// Random write:
|
||||||
|
//
|
||||||
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
|
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
|
||||||
// -data_device=./test_data.bin -meta_device=./test_meta.bin -journal_device=./test_journal.bin -size=1G
|
// -data_device=./test_data.bin -meta_device=./test_meta.bin -journal_device=./test_journal.bin -size=1000M
|
||||||
|
//
|
||||||
|
// Linear write:
|
||||||
|
//
|
||||||
|
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
|
||||||
|
// -data_device=./test_data.bin -meta_device=./test_meta.bin -journal_device=./test_journal.bin -size=1000M
|
||||||
|
//
|
||||||
|
// Random read (run with -iodepth=32 or -iodepth=1):
|
||||||
|
//
|
||||||
|
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
|
||||||
|
// -data_device=./test_data.bin -meta_device=./test_meta.bin -journal_device=./test_journal.bin -size=1000M
|
||||||
|
|
||||||
#include "blockstore.h"
|
#include "blockstore.h"
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
Loading…
Reference in New Issue