2019-11-03 22:04:25 +03:00
|
|
|
#include "blockstore.h"
|
|
|
|
|
|
|
|
// Builds the metadata initializer; it only remembers the blockstore it will operate on.
blockstore_init_meta::blockstore_init_meta(blockstore *bs)
    : bs(bs)
{
}
|
|
|
|
|
2019-11-05 02:43:21 +03:00
|
|
|
// Completion callback for a metadata read submitted by loop().
// A negative result is reported as a thrown std::runtime_error pointer.
// Otherwise the finished request is recorded: prev_done receives the id of the
// request that just completed (or 0 when nothing was read), done_len the number
// of bytes read, and the read position advances by that amount.
void blockstore_init_meta::handle_event(ring_data_t *data)
{
    if (data->res < 0)
    {
        std::string message("read metadata failed at offset ");
        message += std::to_string(metadata_read);
        message += ": ";
        message += strerror(-data->res);
        throw new std::runtime_error(message);
    }
    if (data->res > 0)
    {
        prev_done = submitted;
    }
    else
    {
        prev_done = 0;
    }
    done_len = data->res;
    metadata_read += data->res;
    // No request is in flight any more
    submitted = 0;
}
|
|
|
|
|
|
|
|
int blockstore_init_meta::loop()
|
2019-11-03 22:04:25 +03:00
|
|
|
{
|
|
|
|
if (metadata_read >= bs->meta_len)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (!metadata_buffer)
|
|
|
|
{
|
|
|
|
metadata_buffer = (uint8_t*)memalign(512, 2*bs->metadata_buf_size);
|
|
|
|
}
|
|
|
|
if (!submitted)
|
|
|
|
{
|
2019-11-05 14:10:23 +03:00
|
|
|
struct io_uring_sqe *sqe = bs->get_sqe();
|
2019-11-03 22:04:25 +03:00
|
|
|
if (!sqe)
|
|
|
|
{
|
|
|
|
throw new std::runtime_error("io_uring is full while trying to read metadata");
|
|
|
|
}
|
2019-11-05 02:43:21 +03:00
|
|
|
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
|
|
|
data->iov = {
|
2019-11-03 22:04:25 +03:00
|
|
|
metadata_buffer + (prev == 1 ? bs->metadata_buf_size : 0),
|
|
|
|
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
|
|
|
|
};
|
2019-11-13 21:17:04 +03:00
|
|
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
2019-11-05 02:43:21 +03:00
|
|
|
io_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
|
|
|
|
bs->ringloop->submit();
|
2019-11-03 22:04:25 +03:00
|
|
|
submitted = (prev == 1 ? 2 : 1);
|
|
|
|
prev = submitted;
|
|
|
|
}
|
|
|
|
if (prev_done)
|
|
|
|
{
|
2019-11-14 01:13:07 +03:00
|
|
|
int count = 512 / sizeof(clean_disk_entry);
|
|
|
|
for (int sector = 0; sector < done_len; sector += 512)
|
|
|
|
{
|
|
|
|
clean_disk_entry *entries = (clean_disk_entry*)(metadata_buffer + (prev_done == 1 ? bs->metadata_buf_size : 0) + sector);
|
|
|
|
// handle <count> entries
|
|
|
|
handle_entries(entries, count, bs->block_order);
|
|
|
|
done_cnt += count;
|
|
|
|
}
|
2019-11-03 22:04:25 +03:00
|
|
|
prev_done = 0;
|
|
|
|
done_len = 0;
|
|
|
|
}
|
|
|
|
if (metadata_read >= bs->meta_len)
|
|
|
|
{
|
|
|
|
// metadata read finished
|
|
|
|
free(metadata_buffer);
|
|
|
|
metadata_buffer = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2019-11-14 01:13:07 +03:00
|
|
|
void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, int count, int block_order)
|
2019-11-03 22:04:25 +03:00
|
|
|
{
|
2019-11-15 02:03:57 +03:00
|
|
|
auto end = bs->clean_db.end();
|
2019-11-03 22:04:25 +03:00
|
|
|
for (unsigned i = 0; i < count; i++)
|
|
|
|
{
|
|
|
|
if (entries[i].oid.inode > 0)
|
|
|
|
{
|
2019-11-15 02:03:57 +03:00
|
|
|
auto clean_it = bs->clean_db.find(entries[i].oid);
|
|
|
|
if (clean_it == end || clean_it->second.version < entries[i].version)
|
|
|
|
{
|
|
|
|
if (clean_it != end)
|
|
|
|
{
|
|
|
|
// free the previous block
|
|
|
|
allocator_set(bs->data_alloc, clean_it->second.version >> block_order, false);
|
|
|
|
}
|
|
|
|
allocator_set(bs->data_alloc, done_cnt+i, true);
|
|
|
|
bs->clean_db[entries[i].oid] = (struct clean_entry){
|
|
|
|
.version = entries[i].version,
|
|
|
|
.location = (done_cnt+i) << block_order,
|
|
|
|
};
|
|
|
|
}
|
2019-11-03 22:04:25 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Builds the journal initializer; it only remembers the blockstore it will operate on.
blockstore_init_journal::blockstore_init_journal(blockstore *bs)
    : bs(bs)
{
}
|
|
|
|
|
2019-11-04 01:42:40 +03:00
|
|
|
// Returns true when the first <len> 64-bit words of <buf> are all zero
// (also true when len <= 0), false as soon as a non-zero word is found.
bool iszero(uint64_t *buf, int len)
{
    int idx = 0;
    while (idx < len)
    {
        if (buf[idx] != 0)
        {
            return false;
        }
        idx++;
    }
    return true;
}
|
|
|
|
|
2019-11-05 02:43:21 +03:00
|
|
|
// Completion callback for journal reads submitted by loop().
// step == 1: the first 512-byte block of the journal has been read.
// step == 2 or 3: a chunk of the journal body has been read.
// A negative result is reported as a thrown std::runtime_error pointer.
void blockstore_init_journal::handle_event(ring_data_t *data)
{
    if (step == 1)
    {
        // Step 1: Read first block of the journal
        if (data->res < 0)
        {
            std::string message = std::string("read journal failed at offset ") + std::to_string(0);
            message += ": ";
            message += strerror(-data->res);
            throw new std::runtime_error(message);
        }
        if (!iszero((uint64_t*)journal_buffer, 3))
        {
            // First block always contains a single JE_START entry
            journal_entry_start *start = (journal_entry_start*)journal_buffer;
            const bool valid =
                start->magic == JOURNAL_MAGIC &&
                start->type == JE_START &&
                start->size == sizeof(journal_entry_start) &&
                je_crc32((journal_entry*)start) == start->crc32;
            if (!valid)
            {
                // Entry is corrupt
                throw new std::runtime_error("first entry of the journal is corrupt");
            }
            bs->journal.used_start = start->journal_start;
            journal_pos = bs->journal.used_start;
            crc32_last = start->crc32_replaced;
            step = 2;
        }
        else
        {
            // Journal is empty (the first three 64-bit words are zero)
            // FIXME handle this wrapping to 512 better
            bs->journal.used_start = 512;
            bs->journal.next_free = 512;
            step = 99;
        }
        return;
    }
    if (step != 2 && step != 3)
    {
        return;
    }
    // Step 3: Read journal
    if (data->res < 0)
    {
        std::string message = std::string("read journal failed at offset ") + std::to_string(journal_pos);
        message += ": ";
        message += strerror(-data->res);
        throw new std::runtime_error(message);
    }
    // Remember where the finished chunk starts, which request id it belongs to
    // and how long it is, then advance the read position
    done_pos = journal_pos;
    done_buf = submitted;
    done_len = data->res;
    journal_pos += data->res;
    if (journal_pos >= bs->journal.len)
    {
        // Continue from the beginning
        journal_pos = 512;
        wrapped = true;
    }
    submitted = 0;
}
|
|
|
|
|
|
|
|
int blockstore_init_journal::loop()
|
2019-11-03 22:04:25 +03:00
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
if (step == 100)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2019-11-03 22:04:25 +03:00
|
|
|
if (!journal_buffer)
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
journal_buffer = (uint8_t*)memalign(DISK_ALIGNMENT, 2*JOURNAL_BUFFER_SIZE);
|
|
|
|
}
|
|
|
|
if (step == 0)
|
|
|
|
{
|
|
|
|
// Step 1: Read first block of the journal
|
2019-11-05 14:10:23 +03:00
|
|
|
struct io_uring_sqe *sqe = bs->get_sqe();
|
2019-11-04 01:42:40 +03:00
|
|
|
if (!sqe)
|
|
|
|
{
|
|
|
|
throw new std::runtime_error("io_uring is full while trying to read journal");
|
|
|
|
}
|
2019-11-05 02:43:21 +03:00
|
|
|
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
|
|
|
data->iov = { journal_buffer, 512 };
|
2019-11-13 21:17:04 +03:00
|
|
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
2019-11-07 16:58:30 +03:00
|
|
|
io_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
2019-11-05 02:43:21 +03:00
|
|
|
bs->ringloop->submit();
|
2019-11-04 01:42:40 +03:00
|
|
|
step = 1;
|
|
|
|
}
|
|
|
|
if (step == 2 || step == 3)
|
|
|
|
{
|
|
|
|
// Step 3: Read journal
|
2019-11-04 15:46:33 +03:00
|
|
|
if (!submitted)
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
if (step != 3)
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-07 16:58:30 +03:00
|
|
|
if (journal_pos == bs->journal.used_start && wrapped)
|
2019-11-04 15:46:33 +03:00
|
|
|
{
|
|
|
|
step = 3;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-11-05 14:10:23 +03:00
|
|
|
struct io_uring_sqe *sqe = bs->get_sqe();
|
2019-11-04 15:46:33 +03:00
|
|
|
if (!sqe)
|
|
|
|
{
|
|
|
|
throw new std::runtime_error("io_uring is full while trying to read journal");
|
|
|
|
}
|
2019-11-05 02:43:21 +03:00
|
|
|
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
2019-11-07 16:58:30 +03:00
|
|
|
uint64_t end = bs->journal.len;
|
|
|
|
if (journal_pos < bs->journal.used_start)
|
2019-11-04 15:46:33 +03:00
|
|
|
{
|
2019-11-07 16:58:30 +03:00
|
|
|
end = bs->journal.used_start;
|
2019-11-04 15:46:33 +03:00
|
|
|
}
|
2019-11-05 02:43:21 +03:00
|
|
|
data->iov = {
|
2019-11-04 15:46:33 +03:00
|
|
|
journal_buffer + (done_buf == 1 ? JOURNAL_BUFFER_SIZE : 0),
|
|
|
|
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
|
|
|
};
|
2019-11-13 21:17:04 +03:00
|
|
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
2019-11-07 16:58:30 +03:00
|
|
|
io_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
2019-11-05 02:43:21 +03:00
|
|
|
bs->ringloop->submit();
|
2019-11-04 15:46:33 +03:00
|
|
|
submitted = done_buf == 1 ? 2 : 1;
|
|
|
|
}
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
2019-11-04 15:46:33 +03:00
|
|
|
else
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
step = 99;
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (done_buf && step != 3)
|
|
|
|
{
|
|
|
|
// handle journal entries
|
2019-11-04 15:46:33 +03:00
|
|
|
if (handle_journal_part(journal_buffer + (done_buf == 1 ? 0 : JOURNAL_BUFFER_SIZE), done_len) == 0)
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
// journal ended. wait for the next read to complete, then stop
|
2019-11-04 01:42:40 +03:00
|
|
|
step = 3;
|
|
|
|
}
|
|
|
|
done_buf = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (step == 99)
|
|
|
|
{
|
|
|
|
free(journal_buffer);
|
2019-11-07 16:58:30 +03:00
|
|
|
bs->journal.crc32_last = crc32_last;
|
2019-11-04 01:42:40 +03:00
|
|
|
journal_buffer = NULL;
|
|
|
|
step = 100;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2019-11-04 15:46:33 +03:00
|
|
|
int blockstore_init_journal::handle_journal_part(void *buf, uint64_t len)
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
uint64_t total_pos = 0;
|
|
|
|
if (cur_skip >= 0)
|
|
|
|
{
|
|
|
|
total_pos = cur_skip;
|
|
|
|
cur_skip = 0;
|
|
|
|
}
|
2019-11-04 01:42:40 +03:00
|
|
|
while (total_pos < len)
|
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
total_pos += 512;
|
|
|
|
uint64_t pos = 0;
|
2019-11-04 01:42:40 +03:00
|
|
|
while (pos < 512)
|
|
|
|
{
|
|
|
|
journal_entry *je = (journal_entry*)((uint8_t*)buf + total_pos + pos);
|
|
|
|
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
|
|
|
|
je->type < JE_SMALL_WRITE || je->type > JE_DELETE || je->crc32_prev != crc32_last)
|
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
if (pos == 0)
|
|
|
|
{
|
|
|
|
// invalid entry in the beginning, this is definitely the end of the journal
|
2019-11-07 16:58:30 +03:00
|
|
|
// FIXME handle the edge case when the journal is full
|
|
|
|
bs->journal.next_free = done_pos + total_pos;
|
2019-11-04 15:46:33 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// allow partially filled sectors
|
|
|
|
break;
|
|
|
|
}
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
2019-11-14 21:15:59 +03:00
|
|
|
bs->journal.used_sectors[total_pos]++;
|
2019-11-04 01:42:40 +03:00
|
|
|
pos += je->size;
|
2019-11-04 15:46:33 +03:00
|
|
|
crc32_last = je->crc32;
|
2019-11-04 01:42:40 +03:00
|
|
|
if (je->type == JE_SMALL_WRITE)
|
|
|
|
{
|
|
|
|
// oid, version, offset, len
|
2019-11-04 15:46:33 +03:00
|
|
|
uint64_t location;
|
2019-11-07 16:58:30 +03:00
|
|
|
if (cur_skip > 0 || done_pos + total_pos + je->small_write.len > bs->journal.len)
|
2019-11-04 15:46:33 +03:00
|
|
|
{
|
|
|
|
// data continues from the beginning of the journal
|
|
|
|
location = 512 + cur_skip;
|
|
|
|
cur_skip += je->small_write.len;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// data is right next
|
|
|
|
location = done_pos + total_pos;
|
2019-11-14 21:15:59 +03:00
|
|
|
// FIXME: OOPS. Please don't modify total_pos here
|
2019-11-04 15:46:33 +03:00
|
|
|
total_pos += je->small_write.len;
|
|
|
|
}
|
2019-11-08 00:19:17 +03:00
|
|
|
bs->dirty_db.emplace((obj_ver_id){
|
|
|
|
.oid = je->small_write.oid,
|
2019-11-04 01:42:40 +03:00
|
|
|
.version = je->small_write.version,
|
2019-11-08 00:19:17 +03:00
|
|
|
}, (dirty_entry){
|
2019-11-04 01:42:40 +03:00
|
|
|
.state = ST_J_SYNCED,
|
|
|
|
.flags = 0,
|
2019-11-04 15:46:33 +03:00
|
|
|
.location = location,
|
2019-11-04 01:42:40 +03:00
|
|
|
.offset = je->small_write.offset,
|
2019-11-14 01:13:07 +03:00
|
|
|
.len = je->small_write.len,
|
2019-11-14 21:15:59 +03:00
|
|
|
.journal_sector = total_pos,
|
2019-11-04 01:42:40 +03:00
|
|
|
});
|
|
|
|
}
|
|
|
|
else if (je->type == JE_BIG_WRITE)
|
|
|
|
{
|
|
|
|
// oid, version, block
|
2019-11-08 00:19:17 +03:00
|
|
|
bs->dirty_db.emplace((obj_ver_id){
|
|
|
|
.oid = je->big_write.oid,
|
2019-11-04 01:42:40 +03:00
|
|
|
.version = je->big_write.version,
|
2019-11-08 00:19:17 +03:00
|
|
|
}, (dirty_entry){
|
2019-11-04 01:42:40 +03:00
|
|
|
.state = ST_D_META_SYNCED,
|
|
|
|
.flags = 0,
|
2019-11-12 20:55:17 +03:00
|
|
|
.location = je->big_write.location,
|
2019-11-04 01:42:40 +03:00
|
|
|
.offset = 0,
|
2019-11-14 01:13:07 +03:00
|
|
|
.len = bs->block_size,
|
2019-11-14 21:15:59 +03:00
|
|
|
.journal_sector = total_pos,
|
2019-11-04 01:42:40 +03:00
|
|
|
});
|
|
|
|
}
|
|
|
|
else if (je->type == JE_STABLE)
|
|
|
|
{
|
|
|
|
// oid, version
|
2019-11-08 00:19:17 +03:00
|
|
|
auto it = bs->dirty_db.find((obj_ver_id){
|
|
|
|
.oid = je->stable.oid,
|
|
|
|
.version = je->stable.version,
|
|
|
|
});
|
|
|
|
if (it == bs->dirty_db.end())
|
2019-11-04 01:42:40 +03:00
|
|
|
{
|
2019-11-04 15:46:33 +03:00
|
|
|
// journal contains a legitimate STABLE entry for a non-existing dirty write
|
|
|
|
// this probably means that journal was trimmed between WRITTEN and STABLE entries
|
|
|
|
// skip for now. but FIXME: maybe warn about it in the future
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-11-08 00:19:17 +03:00
|
|
|
it->second.state = (it->second.state == ST_D_META_SYNCED
|
|
|
|
? ST_D_STABLE
|
|
|
|
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (je->type == JE_DELETE)
|
|
|
|
{
|
|
|
|
// oid, version
|
2019-11-08 00:19:17 +03:00
|
|
|
bs->dirty_db.emplace((obj_ver_id){
|
|
|
|
.oid = je->del.oid,
|
|
|
|
.version = je->del.version,
|
|
|
|
}, (dirty_entry){
|
2019-11-04 15:46:33 +03:00
|
|
|
.state = ST_DEL_SYNCED,
|
|
|
|
.flags = 0,
|
|
|
|
.location = 0,
|
|
|
|
.offset = 0,
|
2019-11-14 01:13:07 +03:00
|
|
|
.len = 0,
|
2019-11-14 21:15:59 +03:00
|
|
|
.journal_sector = total_pos,
|
2019-11-04 15:46:33 +03:00
|
|
|
});
|
2019-11-04 01:42:40 +03:00
|
|
|
}
|
|
|
|
}
|
2019-11-04 15:46:33 +03:00
|
|
|
}
|
|
|
|
if (cur_skip == 0 && total_pos > len)
|
|
|
|
{
|
|
|
|
cur_skip = total_pos - len;
|
2019-11-03 22:04:25 +03:00
|
|
|
}
|
2019-11-04 01:42:40 +03:00
|
|
|
return 1;
|
2019-11-03 22:04:25 +03:00
|
|
|
}
|