forked from vitalif/vitastor
Add metadata superblock and check it on start
Refuse to start if the superblock is missing or has a bad version; zero out the metadata area when initializing the superblock.
rdma-zerocopy
parent
f684d9101a
commit
2a02f3c4c7
|
@ -51,7 +51,7 @@ async function run()
|
||||||
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
|
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
|
||||||
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
|
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
|
||||||
const object_count = Math.floor((device_size-meta_offset)/options.object_size);
|
const object_count = Math.floor((device_size-meta_offset)/options.object_size);
|
||||||
const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size;
|
const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
|
||||||
const data_offset = meta_offset + meta_size;
|
const data_offset = meta_offset + meta_size;
|
||||||
const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
|
const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
|
||||||
: Math.round(meta_size/1024/1024*100)/100+" MB");
|
: Math.round(meta_size/1024/1024*100)/100+" MB");
|
||||||
|
@ -65,6 +65,9 @@ async function run()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
process.stdout.write(
|
process.stdout.write(
|
||||||
|
(options.device_block_size != 4096 ?
|
||||||
|
` --meta_block_size ${options.device}\n`+
|
||||||
|
` --journal_block-size ${options.device}\n` : '')+
|
||||||
` --data_device ${options.device}\n`+
|
` --data_device ${options.device}\n`+
|
||||||
` --journal_offset ${options.journal_offset}\n`+
|
` --journal_offset ${options.journal_offset}\n`+
|
||||||
` --meta_offset ${meta_offset}\n`+
|
` --meta_offset ${meta_offset}\n`+
|
||||||
|
|
|
@ -78,6 +78,23 @@
|
||||||
|
|
||||||
#include "blockstore_journal.h"
|
#include "blockstore_journal.h"
|
||||||
|
|
||||||
|
// "VITAstor"
|
||||||
|
#define BLOCKSTORE_META_MAGIC 0x726F747341544956l // bytes, least-significant first: 'V','I','T','A','s','t','o','r'
|
||||||
|
// Superblock format version; a superblock carrying any other value is rejected on start
#define BLOCKSTORE_META_VERSION 1
|
||||||
|
|
||||||
|
// metadata header (superblock)
// Read from and written to the first meta_block_size bytes at bs->meta_offset by
// blockstore_init_meta::loop(). Packed, so the fields occupy 8+8+8+4+4+4 = 36 bytes
// with no padding between them.
|
||||||
|
// FIXME: After adding the OSD superblock, add a key to metadata
|
||||||
|
// and journal headers to check if they belong to the same OSD
|
||||||
|
struct __attribute__((__packed__)) blockstore_meta_header_t
|
||||||
|
{
|
||||||
|
uint64_t zero; // written as 0 on initialization; a non-zero value on start is reported as corrupt/old metadata
|
||||||
|
uint64_t magic; // must equal BLOCKSTORE_META_MAGIC
|
||||||
|
uint64_t version; // must equal BLOCKSTORE_META_VERSION
|
||||||
|
uint32_t meta_block_size; // bs->meta_block_size at initialization; must match the running configuration
|
||||||
|
uint32_t data_block_size; // bs->block_size at initialization; must match the running configuration
|
||||||
|
uint32_t bitmap_granularity; // bs->bitmap_granularity at initialization; must match the running configuration
|
||||||
|
};
|
||||||
|
|
||||||
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
||||||
// per "clean" entry on disk with fixed metadata tables
|
// per "clean" entry on disk with fixed metadata tables
|
||||||
// FIXME: maybe add crc32's to metadata
|
// FIXME: maybe add crc32's to metadata
|
||||||
|
|
|
@ -3,6 +3,20 @@
|
||||||
|
|
||||||
#include "blockstore_impl.h"
|
#include "blockstore_impl.h"
|
||||||
|
|
||||||
|
// Obtain an io_uring SQE via bs->get_sqe() into `sqe` and point `data` at the
// ring_data_t stored in sqe->user_data. Throws std::runtime_error if no SQE is returned.
// Relies on `bs`, `sqe` and `data` being in scope where the macro is expanded.
#define GET_SQE() \
|
||||||
|
sqe = bs->get_sqe();\
|
||||||
|
if (!sqe)\
|
||||||
|
throw std::runtime_error("io_uring is full during initialization");\
|
||||||
|
data = ((ring_data_t*)sqe->user_data)
|
||||||
|
|
||||||
|
// Returns true when the first `len` elements of `buf` are all zero.
// `len` is a count of 64-bit words, NOT bytes: callers pass
// <size in bytes> / sizeof(uint64_t). Passing a byte count would read past the buffer.
static bool iszero(uint64_t *buf, int len)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
if (buf[i] != 0)
|
||||||
|
return false; // first non-zero word found
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
|
blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
|
||||||
{
|
{
|
||||||
this->bs = bs;
|
this->bs = bs;
|
||||||
|
@ -10,7 +24,7 @@ blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
|
||||||
|
|
||||||
void blockstore_init_meta::handle_event(ring_data_t *data)
|
void blockstore_init_meta::handle_event(ring_data_t *data)
|
||||||
{
|
{
|
||||||
if (data->res <= 0)
|
if (data->res < 0)
|
||||||
{
|
{
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
std::string("read metadata failed at offset ") + std::to_string(metadata_read) +
|
std::string("read metadata failed at offset ") + std::to_string(metadata_read) +
|
||||||
|
@ -28,6 +42,12 @@ int blockstore_init_meta::loop()
|
||||||
{
|
{
|
||||||
if (wait_state == 1)
|
if (wait_state == 1)
|
||||||
goto resume_1;
|
goto resume_1;
|
||||||
|
else if (wait_state == 2)
|
||||||
|
goto resume_2;
|
||||||
|
else if (wait_state == 3)
|
||||||
|
goto resume_3;
|
||||||
|
else if (wait_state == 4)
|
||||||
|
goto resume_4;
|
||||||
printf("Reading blockstore metadata\n");
|
printf("Reading blockstore metadata\n");
|
||||||
if (bs->inmemory_meta)
|
if (bs->inmemory_meta)
|
||||||
metadata_buffer = bs->metadata_buffer;
|
metadata_buffer = bs->metadata_buffer;
|
||||||
|
@ -35,22 +55,98 @@ int blockstore_init_meta::loop()
|
||||||
metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size);
|
metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size);
|
||||||
if (!metadata_buffer)
|
if (!metadata_buffer)
|
||||||
throw std::runtime_error("Failed to allocate metadata read buffer");
|
throw std::runtime_error("Failed to allocate metadata read buffer");
|
||||||
while (1)
|
// Read superblock
|
||||||
{
|
GET_SQE();
|
||||||
resume_1:
|
data->iov = { metadata_buffer, bs->meta_block_size };
|
||||||
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
||||||
|
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
|
||||||
|
bs->ringloop->submit();
|
||||||
|
submitted = 1;
|
||||||
|
resume_1:
|
||||||
if (submitted)
|
if (submitted)
|
||||||
{
|
{
|
||||||
wait_state = 1;
|
wait_state = 1;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t)))
|
||||||
|
{
|
||||||
|
{
|
||||||
|
blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer;
|
||||||
|
hdr->zero = 0;
|
||||||
|
hdr->magic = BLOCKSTORE_META_MAGIC;
|
||||||
|
hdr->version = BLOCKSTORE_META_VERSION;
|
||||||
|
hdr->meta_block_size = bs->meta_block_size;
|
||||||
|
hdr->data_block_size = bs->block_size;
|
||||||
|
hdr->bitmap_granularity = bs->bitmap_granularity;
|
||||||
|
}
|
||||||
|
if (bs->readonly)
|
||||||
|
{
|
||||||
|
printf("Skipping metadata initialization because blockstore is readonly\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
printf("Initializing metadata area\n");
|
||||||
|
GET_SQE();
|
||||||
|
data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size };
|
||||||
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
||||||
|
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
|
||||||
|
bs->ringloop->submit();
|
||||||
|
submitted = 1;
|
||||||
|
resume_3:
|
||||||
|
if (submitted > 0)
|
||||||
|
{
|
||||||
|
wait_state = 3;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
zero_on_init = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer;
|
||||||
|
if (hdr->zero != 0 ||
|
||||||
|
hdr->magic != BLOCKSTORE_META_MAGIC ||
|
||||||
|
hdr->version != BLOCKSTORE_META_VERSION)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"Metadata is corrupt or old version.\n"
|
||||||
|
" If this is a new OSD please zero out the metadata area before starting it.\n"
|
||||||
|
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n"
|
||||||
|
);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (hdr->meta_block_size != bs->meta_block_size ||
|
||||||
|
hdr->data_block_size != bs->block_size ||
|
||||||
|
hdr->bitmap_granularity != bs->bitmap_granularity)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"Configuration stored in metadata superblock"
|
||||||
|
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
|
||||||
|
" differs from OSD configuration (%lu/%u/%lu).\n",
|
||||||
|
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
|
||||||
|
bs->meta_block_size, bs->block_size, bs->bitmap_granularity
|
||||||
|
);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Skip superblock
|
||||||
|
bs->meta_offset += bs->meta_block_size;
|
||||||
|
prev_done = 0;
|
||||||
|
done_len = 0;
|
||||||
|
done_pos = 0;
|
||||||
|
metadata_read = 0;
|
||||||
|
// Read the rest of the metadata
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
resume_2:
|
||||||
|
if (submitted)
|
||||||
|
{
|
||||||
|
wait_state = 2;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
if (metadata_read < bs->meta_len)
|
if (metadata_read < bs->meta_len)
|
||||||
{
|
{
|
||||||
sqe = bs->get_sqe();
|
GET_SQE();
|
||||||
if (!sqe)
|
|
||||||
{
|
|
||||||
throw std::runtime_error("io_uring is full while trying to read metadata");
|
|
||||||
}
|
|
||||||
data = ((ring_data_t*)sqe->user_data);
|
|
||||||
data->iov = {
|
data->iov = {
|
||||||
metadata_buffer + (bs->inmemory_meta
|
metadata_buffer + (bs->inmemory_meta
|
||||||
? metadata_read
|
? metadata_read
|
||||||
|
@ -58,7 +154,14 @@ int blockstore_init_meta::loop()
|
||||||
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
|
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
|
||||||
};
|
};
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
||||||
|
if (!zero_on_init)
|
||||||
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
|
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Fill metadata with zeroes
|
||||||
|
memset(data->iov.iov_base, 0, data->iov.iov_len);
|
||||||
|
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
|
||||||
|
}
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
submitted = (prev == 1 ? 2 : 1);
|
submitted = (prev == 1 ? 2 : 1);
|
||||||
prev = submitted;
|
prev = submitted;
|
||||||
|
@ -90,6 +193,21 @@ int blockstore_init_meta::loop()
|
||||||
free(metadata_buffer);
|
free(metadata_buffer);
|
||||||
metadata_buffer = NULL;
|
metadata_buffer = NULL;
|
||||||
}
|
}
|
||||||
|
if (zero_on_init && !bs->disable_meta_fsync)
|
||||||
|
{
|
||||||
|
GET_SQE();
|
||||||
|
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
|
||||||
|
data->iov = { 0 };
|
||||||
|
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
||||||
|
submitted = 1;
|
||||||
|
bs->ringloop->submit();
|
||||||
|
resume_4:
|
||||||
|
if (submitted > 0)
|
||||||
|
{
|
||||||
|
wait_state = 4;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -156,14 +274,6 @@ blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
bool iszero(uint64_t *buf, int len)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < len; i++)
|
|
||||||
if (buf[i] != 0)
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void blockstore_init_journal::handle_event(ring_data_t *data1)
|
void blockstore_init_journal::handle_event(ring_data_t *data1)
|
||||||
{
|
{
|
||||||
if (data1->res <= 0)
|
if (data1->res <= 0)
|
||||||
|
@ -188,12 +298,6 @@ void blockstore_init_journal::handle_event(ring_data_t *data1)
|
||||||
submitted_buf = NULL;
|
submitted_buf = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define GET_SQE() \
|
|
||||||
sqe = bs->get_sqe();\
|
|
||||||
if (!sqe)\
|
|
||||||
throw std::runtime_error("io_uring is full while trying to read journal");\
|
|
||||||
data = ((ring_data_t*)sqe->user_data)
|
|
||||||
|
|
||||||
int blockstore_init_journal::loop()
|
int blockstore_init_journal::loop()
|
||||||
{
|
{
|
||||||
if (wait_state == 1)
|
if (wait_state == 1)
|
||||||
|
@ -231,7 +335,7 @@ resume_1:
|
||||||
wait_state = 1;
|
wait_state = 1;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (iszero((uint64_t*)submitted_buf, bs->journal.block_size))
|
if (iszero((uint64_t*)submitted_buf, bs->journal.block_size / sizeof(uint64_t)))
|
||||||
{
|
{
|
||||||
// Journal is empty
|
// Journal is empty
|
||||||
// FIXME handle this wrapping to journal_block_size better (maybe)
|
// FIXME handle this wrapping to journal_block_size better (maybe)
|
||||||
|
|
|
@ -7,6 +7,7 @@ class blockstore_init_meta
|
||||||
{
|
{
|
||||||
blockstore_impl_t *bs;
|
blockstore_impl_t *bs;
|
||||||
int wait_state = 0, wait_count = 0;
|
int wait_state = 0, wait_count = 0;
|
||||||
|
bool zero_on_init = false;
|
||||||
void *metadata_buffer = NULL;
|
void *metadata_buffer = NULL;
|
||||||
uint64_t metadata_read = 0;
|
uint64_t metadata_read = 0;
|
||||||
int prev = 0, prev_done = 0, done_len = 0, submitted = 0;
|
int prev = 0, prev_done = 0, done_len = 0, submitted = 0;
|
||||||
|
|
|
@ -257,7 +257,7 @@ void blockstore_impl_t::calc_lengths()
|
||||||
}
|
}
|
||||||
// required metadata size
|
// required metadata size
|
||||||
block_count = data_len / block_size;
|
block_count = data_len / block_size;
|
||||||
meta_len = ((block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||||
if (meta_area < meta_len)
|
if (meta_area < meta_len)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
|
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
|
||||||
|
|
Loading…
Reference in New Issue