#include "blockstore_impl.h"
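
// Blockstore startup initialization: blockstore_init_meta loads clean object
// entries from the metadata area, blockstore_init_journal replays the journal.
// Both are written as resumable state machines: loop() returns a non-zero
// value while io_uring requests are still in flight and is re-entered by the
// caller until it returns 0.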

blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
{
    this->bs = bs;
}

void blockstore_init_meta::handle_event(ring_data_t *data)
{
    if (data->res <= 0)
    {
        throw std::runtime_error(
            std::string("read metadata failed at offset ") + std::to_string(metadata_read) +
            std::string(": ") + strerror(-data->res)
        );
    }
    prev_done = data->res > 0 ? submitted : 0;
    done_len = data->res;
    done_pos = metadata_read;
    metadata_read += data->res;
    submitted = 0;
}
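
// Metadata loading state machine. Returns 1 while a metadata read is in
// flight (the caller re-enters loop() after io_uring completions) and 0 once
// the whole metadata area has been processed. When the metadata is not kept
// in memory, reads alternate between the two halves of a temporary double
// buffer, so one chunk is parsed while the next read is in flight.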
int blockstore_init_meta::loop()
{
    if (wait_state == 1)
        goto resume_1;
    printf("Reading blockstore metadata\n");
    if (bs->inmemory_meta)
        metadata_buffer = bs->metadata_buffer;
    else
        metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size);
    if (!metadata_buffer)
        throw std::runtime_error("Failed to allocate metadata read buffer");
    while (1)
    {
    resume_1:
        if (submitted)
        {
            wait_state = 1;
            return 1;
        }
        if (metadata_read < bs->meta_len)
        {
            sqe = bs->get_sqe();
            if (!sqe)
            {
                throw std::runtime_error("io_uring is full while trying to read metadata");
            }
            data = ((ring_data_t*)sqe->user_data);
            data->iov = {
                metadata_buffer + (bs->inmemory_meta
                    ? metadata_read
                    : (prev == 1 ? bs->metadata_buf_size : 0)),
                bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
            };
            data->callback = [this](ring_data_t *data) { handle_event(data); };
            my_uring_prep_readv(sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + metadata_read);
            sqe->flags |= IOSQE_FIXED_FILE;
            bs->ringloop->submit();
            submitted = (prev == 1 ? 2 : 1);
            prev = submitted;
        }
        if (prev_done)
        {
            void *done_buf = bs->inmemory_meta
                ? (metadata_buffer + done_pos)
                : (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
            unsigned count = bs->meta_block_size / bs->clean_entry_size;
            for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
            {
                // handle <count> entries
                handle_entries(done_buf + sector, count, bs->block_order);
                done_cnt += count;
            }
            prev_done = 0;
            done_len = 0;
        }
        if (!submitted)
        {
            break;
        }
    }
    // metadata read finished
    printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
    if (!bs->inmemory_meta)
    {
        free(metadata_buffer);
        metadata_buffer = NULL;
    }
    return 0;
}
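
// Parse <count> clean entries from one metadata block. The newest version of
// each object wins: a superseded copy releases its block in the data
// allocator, the winning entry is recorded in clean_db and its block is
// marked as allocated. When metadata is not kept in memory, each entry's
// bitmap is also copied aside into clean_bitmap.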
void blockstore_init_meta::handle_entries(void* entries, unsigned count, int block_order)
{
    for (unsigned i = 0; i < count; i++)
    {
        clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
        if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
        {
            memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
        }
        if (entry->oid.inode > 0)
        {
            auto clean_it = bs->clean_db.find(entry->oid);
            if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
            {
                if (clean_it != bs->clean_db.end())
                {
                    // free the previous block
#ifdef BLOCKSTORE_DEBUG
                    printf("Free block %lu\n", clean_it->second.location >> bs->block_order);
#endif
                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
                }
                entries_loaded++;
#ifdef BLOCKSTORE_DEBUG
                printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
                bs->data_alloc->set(done_cnt+i, true);
                bs->clean_db[entry->oid] = (struct clean_entry){
                    .version = entry->version,
                    .location = (done_cnt+i) << block_order,
                };
            }
            else
            {
#ifdef BLOCKSTORE_DEBUG
                printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
            }
        }
    }
}
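
// Journal recovery: read the first journal block, validate its JE_START
// "superblock", then scan the journal block by block. Entries are chained by
// crc32 (crc32_prev of each entry must match the previous entry's crc32); an
// invalid entry at the start of a block marks the end of the journal, while a
// mid-block failure only ends that block (partially filled sectors are
// allowed).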
blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
{
    this->bs = bs;
    next_free = bs->journal.block_size;
    simple_callback = [this](ring_data_t *data1)
    {
        if (data1->res != data1->iov.iov_len)
        {
            throw std::runtime_error(std::string("I/O operation failed while reading journal: ") + strerror(-data1->res));
        }
        wait_count--;
    };
}

bool iszero(uint64_t *buf, int len)
{
    for (int i = 0; i < len; i++)
        if (buf[i] != 0)
            return false;
    return true;
}
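
// Completion handler for journal reads: record the completed chunk in `done`,
// advance journal_pos and wrap back to the first data block when the end of
// the journal area is reached.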
void blockstore_init_journal::handle_event(ring_data_t *data1)
{
    if (data1->res <= 0)
    {
        throw std::runtime_error(
            std::string("read journal failed at offset ") + std::to_string(journal_pos) +
            std::string(": ") + strerror(-data1->res)
        );
    }
    done.push_back({
        .buf = submitted_buf,
        .pos = journal_pos,
        .len = (uint64_t)data1->res,
    });
    journal_pos += data1->res;
    if (journal_pos >= bs->journal.len)
    {
        // Continue from the beginning
        journal_pos = bs->journal.block_size;
        wrapped = true;
    }
    submitted_buf = NULL;
}

#define GET_SQE() \
    sqe = bs->get_sqe();\
    if (!sqe)\
        throw std::runtime_error("io_uring is full while trying to read journal");\
    data = ((ring_data_t*)sqe->user_data)
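
// Journal replay state machine. Returns 1 while io_uring operations are in
// flight and 0 when replay is complete. wait_state selects the resume point:
// 1 = first block read, 2/3 = journal body reads, 6 = journal reset write and
// 4 = its fsync, 7 = corrupt-entry rewrite and 5 = its fsync.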
int blockstore_init_journal::loop()
{
    if (wait_state == 1)
        goto resume_1;
    else if (wait_state == 2)
        goto resume_2;
    else if (wait_state == 3)
        goto resume_3;
    else if (wait_state == 4)
        goto resume_4;
    else if (wait_state == 5)
        goto resume_5;
    else if (wait_state == 6)
        goto resume_6;
    else if (wait_state == 7)
        goto resume_7;
    printf("Reading blockstore journal\n");
    if (!bs->journal.inmemory)
    {
        submitted_buf = memalign(MEM_ALIGNMENT, 2*bs->journal.block_size);
        if (!submitted_buf)
            throw std::bad_alloc();
    }
    else
        submitted_buf = bs->journal.buffer;
    // Read first block of the journal
    sqe = bs->get_sqe();
    if (!sqe)
        throw std::runtime_error("io_uring is full while trying to read journal");
    data = ((ring_data_t*)sqe->user_data);
    data->iov = { submitted_buf, bs->journal.block_size };
    data->callback = simple_callback;
    my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
    sqe->flags |= IOSQE_FIXED_FILE;
    bs->ringloop->submit();
    wait_count = 1;
resume_1:
    if (wait_count > 0)
    {
        wait_state = 1;
        return 1;
    }
    if (iszero((uint64_t*)submitted_buf, 3))
    {
        // Journal is empty
        // FIXME handle this wrapping to journal_block_size better (maybe)
        bs->journal.used_start = bs->journal.block_size;
        bs->journal.next_free = bs->journal.block_size;
        // Initialize journal "superblock" and the first block
        memset(submitted_buf, 0, 2*bs->journal.block_size);
        *((journal_entry_start*)submitted_buf) = {
            .crc32 = 0,
            .magic = JOURNAL_MAGIC,
            .type = JE_START,
            .size = sizeof(journal_entry_start),
            .reserved = 0,
            .journal_start = bs->journal.block_size,
        };
        ((journal_entry_start*)submitted_buf)->crc32 = je_crc32((journal_entry*)submitted_buf);
        if (bs->readonly)
        {
            printf("Skipping journal initialization because blockstore is readonly\n");
        }
        else
        {
            // Cool effect. Same operations result in journal replay.
            // FIXME: Randomize initial crc32. Track crc32 when trimming.
            printf("Resetting journal\n");
            GET_SQE();
            data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
            data->callback = simple_callback;
            my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
            sqe->flags |= IOSQE_FIXED_FILE;
            wait_count++;
            bs->ringloop->submit();
        resume_6:
            if (wait_count > 0)
            {
                wait_state = 6;
                return 1;
            }
            if (!bs->disable_journal_fsync)
            {
                GET_SQE();
                my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
                sqe->flags |= IOSQE_FIXED_FILE;
                data->iov = { 0 };
                data->callback = simple_callback;
                wait_count++;
                bs->ringloop->submit();
            }
        resume_4:
            if (wait_count > 0)
            {
                wait_state = 4;
                return 1;
            }
        }
        if (!bs->journal.inmemory)
        {
            free(submitted_buf);
        }
    }
    else
    {
        // First block always contains a single JE_START entry
        je_start = (journal_entry_start*)submitted_buf;
        if (je_start->magic != JOURNAL_MAGIC ||
            je_start->type != JE_START ||
            je_start->size != sizeof(journal_entry_start) ||
            je_crc32((journal_entry*)je_start) != je_start->crc32)
        {
            // Entry is corrupt
            throw std::runtime_error("first entry of the journal is corrupt");
        }
        next_free = journal_pos = bs->journal.used_start = je_start->journal_start;
        if (!bs->journal.inmemory)
            free(submitted_buf);
        submitted_buf = NULL;
        crc32_last = 0;
        // Read journal
        while (1)
        {
        resume_2:
            if (submitted_buf)
            {
                wait_state = 2;
                return 1;
            }
            if (!wrapped || journal_pos < bs->journal.used_start)
            {
                GET_SQE();
                uint64_t end = bs->journal.len;
                if (journal_pos < bs->journal.used_start)
                    end = bs->journal.used_start;
                if (!bs->journal.inmemory)
                    submitted_buf = memalign(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE);
                else
                    submitted_buf = bs->journal.buffer + journal_pos;
                data->iov = {
                    submitted_buf,
                    end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
                };
                data->callback = [this](ring_data_t *data1) { handle_event(data1); };
                my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + journal_pos);
                sqe->flags |= IOSQE_FIXED_FILE;
                bs->ringloop->submit();
            }
            while (done.size() > 0)
            {
                handle_res = handle_journal_part(done[0].buf, done[0].pos, done[0].len);
                if (handle_res == 0)
                {
                    // journal ended
                    // zero out corrupted entry, if required
                    if (init_write_buf && !bs->readonly)
                    {
                        GET_SQE();
                        data->iov = { init_write_buf, bs->journal.block_size };
                        data->callback = simple_callback;
                        my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + init_write_sector);
                        sqe->flags |= IOSQE_FIXED_FILE;
                        wait_count++;
                        bs->ringloop->submit();
                    resume_7:
                        if (wait_count > 0)
                        {
                            wait_state = 7;
                            return 1;
                        }
                        if (!bs->disable_journal_fsync)
                        {
                            GET_SQE();
                            data->iov = { 0 };
                            data->callback = simple_callback;
                            my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
                            sqe->flags |= IOSQE_FIXED_FILE;
                            wait_count++;
                            bs->ringloop->submit();
                        }
                    resume_5:
                        if (wait_count > 0)
                        {
                            wait_state = 5;
                            return 1;
                        }
                    }
                    // wait for the next read to complete, then stop
                resume_3:
                    if (submitted_buf)
                    {
                        wait_state = 3;
                        return 1;
                    }
                    // free buffers
                    if (!bs->journal.inmemory)
                        for (auto & e: done)
                            free(e.buf);
                    done.clear();
                    break;
                }
                else if (handle_res == 1)
                {
                    // OK, remove it
                    if (!bs->journal.inmemory)
                    {
                        free(done[0].buf);
                    }
                    done.erase(done.begin());
                }
                else if (handle_res == 2)
                {
                    // Need to wait for more reads
                    break;
                }
            }
            if (!submitted_buf)
            {
                break;
            }
        }
    }
    // Trim journal on start so we don't stall when all entries are older
    bs->journal.trim();
    printf(
        "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
        entries_loaded,
        (bs->journal.next_free >= bs->journal.used_start
            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
            : bs->journal.used_start - bs->journal.next_free),
        bs->journal.used_start, bs->journal.next_free,
        bs->data_alloc->get_free_count(), bs->block_count
    );
    bs->journal.crc32_last = crc32_last;
    return 0;
}
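
// Replay one chunk of the journal read into <buf> (covering bytes
// [done_pos, done_pos+len) of the journal). Returns 0 when the end of the
// journal is reached, 1 when the chunk was processed completely, and 2 when a
// small write's data continues in a part of the journal that has not been
// read yet (processing then resumes from continue_pos once more data
// arrives).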
int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, uint64_t len)
{
    uint64_t proc_pos, pos;
    if (continue_pos != 0)
    {
        proc_pos = (continue_pos / bs->journal.block_size) * bs->journal.block_size;
        pos = continue_pos % bs->journal.block_size;
        continue_pos = 0;
        goto resume;
    }
    while (next_free >= done_pos && next_free < done_pos+len)
    {
        proc_pos = next_free;
        pos = 0;
        next_free += bs->journal.block_size;
        if (next_free >= bs->journal.len)
        {
            next_free = bs->journal.block_size;
        }
    resume:
        while (pos < bs->journal.block_size)
        {
            journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
                je->type < JE_SMALL_WRITE || je->type > JE_DELETE || (started && je->crc32_prev != crc32_last))
            {
                if (pos == 0)
                {
                    // invalid entry in the beginning, this is definitely the end of the journal
                    bs->journal.next_free = proc_pos;
                    return 0;
                }
                else
                {
                    // allow partially filled sectors
                    break;
                }
            }
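            // JE_SMALL_WRITE: the write data lives inside the journal; verify
            // its crc32 (possibly spanning several read buffers) and register
            // a dirty_db entry pointing at the journal location.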
            if (je->type == JE_SMALL_WRITE)
            {
#ifdef BLOCKSTORE_DEBUG
                printf("je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u\n", je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version, je->small_write.offset, je->small_write.len);
#endif
                // oid, version, offset, len
                uint64_t prev_free = next_free;
                if (next_free + je->small_write.len > bs->journal.len)
                {
                    // data continues from the beginning of the journal
                    next_free = bs->journal.block_size;
                }
                uint64_t location = next_free;
                next_free += je->small_write.len;
                if (next_free >= bs->journal.len)
                {
                    next_free = bs->journal.block_size;
                }
                if (location != je->small_write.data_offset)
                {
                    char err[1024];
                    snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
                    throw std::runtime_error(err);
                }
                uint32_t data_crc32 = 0;
                if (location >= done_pos && location+je->small_write.len <= done_pos+len)
                {
                    // data is within this buffer
                    data_crc32 = crc32c(0, buf + location - done_pos, je->small_write.len);
                }
                else
                {
                    // this case is even more interesting because we must carry data crc32 check to next buffer(s)
                    uint64_t covered = 0;
                    for (int i = 0; i < done.size(); i++)
                    {
                        if (location+je->small_write.len > done[i].pos &&
                            location < done[i].pos+done[i].len)
                        {
                            uint64_t part_end = (location+je->small_write.len < done[i].pos+done[i].len
                                ? location+je->small_write.len : done[i].pos+done[i].len);
                            uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
                            covered += part_end - part_begin;
                            data_crc32 = crc32c(data_crc32, done[i].buf + part_begin - done[i].pos, part_end - part_begin);
                        }
                    }
                    if (covered < je->small_write.len)
                    {
                        continue_pos = proc_pos+pos;
                        next_free = prev_free;
                        return 2;
                    }
                }
                if (data_crc32 != je->small_write.crc32_data)
                {
                    // journal entry is corrupt, stop here
                    // interesting thing is that we must clear the corrupt entry if we're not readonly
                    memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
                    bs->journal.next_free = prev_free;
                    init_write_buf = buf + proc_pos - done_pos;
                    init_write_sector = proc_pos;
                    return 0;
                }
                auto clean_it = bs->clean_db.find(je->small_write.oid);
                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->small_write.version)
                {
                    obj_ver_id ov = {
                        .oid = je->small_write.oid,
                        .version = je->small_write.version,
                    };
                    bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = ST_J_SYNCED,
                        .flags = 0,
                        .location = location,
                        .offset = je->small_write.offset,
                        .len = je->small_write.len,
                        .journal_sector = proc_pos,
                    });
                    bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
                    printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
#endif
                    auto & unstab = bs->unstable_writes[ov.oid];
                    unstab = unstab < ov.version ? ov.version : unstab;
                }
            }
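            // JE_BIG_WRITE: the data was already written to a data block;
            // re-mark that block as allocated and register a dirty_db entry
            // pointing at it.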
            else if (je->type == JE_BIG_WRITE)
            {
#ifdef BLOCKSTORE_DEBUG
                printf("je_big_write oid=%lu:%lu ver=%lu loc=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
#endif
                auto clean_it = bs->clean_db.find(je->big_write.oid);
                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->big_write.version)
                {
                    // oid, version, block
                    obj_ver_id ov = {
                        .oid = je->big_write.oid,
                        .version = je->big_write.version,
                    };
                    bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = ST_D_META_SYNCED,
                        .flags = 0,
                        .location = je->big_write.location,
                        .offset = je->big_write.offset,
                        .len = je->big_write.len,
                        .journal_sector = proc_pos,
                    });
#ifdef BLOCKSTORE_DEBUG
                    printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
#endif
                    bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
                    bs->journal.used_sectors[proc_pos]++;
                    auto & unstab = bs->unstable_writes[ov.oid];
                    unstab = unstab < ov.version ? ov.version : unstab;
                }
            }
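            // JE_STABLE: mark the dirty versions of the object up to this
            // version as stable and queue the object for flushing.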
            else if (je->type == JE_STABLE)
            {
#ifdef BLOCKSTORE_DEBUG
                printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
#endif
                // oid, version
                obj_ver_id ov = {
                    .oid = je->stable.oid,
                    .version = je->stable.version,
                };
                auto it = bs->dirty_db.find(ov);
                if (it == bs->dirty_db.end())
                {
                    // journal contains a legitimate STABLE entry for a non-existing dirty write
                    // this probably means that journal was trimmed between WRITE and STABLE entries
                    // skip it
                }
                else
                {
                    while (1)
                    {
                        it->second.state = (it->second.state == ST_D_META_SYNCED
                            ? ST_D_STABLE
                            : (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
                        if (it == bs->dirty_db.begin())
                            break;
                        it--;
                        if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
                            break;
                    }
                    bs->flusher->enqueue_flush(ov);
                }
                auto unstab_it = bs->unstable_writes.find(ov.oid);
                if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
                {
                    bs->unstable_writes.erase(unstab_it);
                }
            }
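            // JE_ROLLBACK: drop dirty versions of the object newer than the
            // rollback version (unless they are in flight or already stable)
            // and adjust the object's unstable_writes entry.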
            else if (je->type == JE_ROLLBACK)
            {
#ifdef BLOCKSTORE_DEBUG
                printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
#endif
                // rollback dirty writes of <oid> up to <version>
                auto it = bs->dirty_db.lower_bound((obj_ver_id){
                    .oid = je->rollback.oid,
                    .version = UINT64_MAX,
                });
                if (it != bs->dirty_db.begin())
                {
                    uint64_t max_unstable = 0;
                    auto rm_start = it;
                    auto rm_end = it;
                    it--;
                    while (it->first.oid == je->rollback.oid &&
                        it->first.version > je->rollback.version &&
                        !IS_IN_FLIGHT(it->second.state) &&
                        !IS_STABLE(it->second.state))
                    {
                        if (it->first.oid != je->rollback.oid)
                            break;
                        else if (it->first.version <= je->rollback.version)
                        {
                            if (!IS_STABLE(it->second.state))
                                max_unstable = it->first.version;
                            break;
                        }
                        else if (IS_STABLE(it->second.state))
                            break;
                        // Remove entry
                        rm_start = it;
                        if (it == bs->dirty_db.begin())
                            break;
                        it--;
                    }
                    if (rm_start != rm_end)
                    {
                        bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
                    }
                    auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
                    if (unstab_it != bs->unstable_writes.end())
                    {
                        if (max_unstable == 0)
                            bs->unstable_writes.erase(unstab_it);
                        else
                            unstab_it->second = max_unstable;
                    }
                }
            }
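            // JE_DELETE: record the object deletion as a synced dirty_db entry.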
            else if (je->type == JE_DELETE)
            {
#ifdef BLOCKSTORE_DEBUG
                printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
#endif
                // oid, version
                obj_ver_id ov = {
                    .oid = je->del.oid,
                    .version = je->del.version,
                };
                bs->dirty_db.emplace(ov, (dirty_entry){
                    .state = ST_DEL_SYNCED,
                    .flags = 0,
                    .location = 0,
                    .offset = 0,
                    .len = 0,
                    .journal_sector = proc_pos,
                });
                bs->journal.used_sectors[proc_pos]++;
            }
            started = true;
            pos += je->size;
            crc32_last = je->crc32;
            entries_loaded++;
        }
    }
    bs->journal.next_free = next_free;
    return 1;
}