forked from vitalif/vitastor
Final fix for the lack of zeroing out of old metadata entries
If a crash occurs during flushing a redirect-write it may happen so that
the disk contains both old and new metadata entries. This is OK, but prior
to 0.8.0 after this situation OSDs started without problem, but then they
crashed after some more overwrites with a "tried to overwrite non-zero
metadata entry" error. 0.8.0 introduced a change that was intended to fix
this situation, but rather than fixing it it prevented OSDs from starting,
now because of a "big_write journal_entry was allocated over another object"
error... :-)
This change finally fixes the original issue.
Followup to 54ef2c389f
rm-left-on-dead
parent
6917a32ca8
commit
5464821fa5
|
@ -48,14 +48,12 @@ void blockstore_init_meta::handle_event(ring_data_t *data, int buf_num)
|
||||||
|
|
||||||
int blockstore_init_meta::loop()
|
int blockstore_init_meta::loop()
|
||||||
{
|
{
|
||||||
if (wait_state == 1)
|
if (wait_state == 1) goto resume_1;
|
||||||
goto resume_1;
|
else if (wait_state == 2) goto resume_2;
|
||||||
else if (wait_state == 2)
|
else if (wait_state == 3) goto resume_3;
|
||||||
goto resume_2;
|
else if (wait_state == 4) goto resume_4;
|
||||||
else if (wait_state == 3)
|
else if (wait_state == 5) goto resume_5;
|
||||||
goto resume_3;
|
else if (wait_state == 6) goto resume_6;
|
||||||
else if (wait_state == 4)
|
|
||||||
goto resume_4;
|
|
||||||
printf("Reading blockstore metadata\n");
|
printf("Reading blockstore metadata\n");
|
||||||
if (bs->inmemory_meta)
|
if (bs->inmemory_meta)
|
||||||
metadata_buffer = bs->metadata_buffer;
|
metadata_buffer = bs->metadata_buffer;
|
||||||
|
@ -140,6 +138,7 @@ resume_1:
|
||||||
// Skip superblock
|
// Skip superblock
|
||||||
md_offset = bs->dsk.meta_block_size;
|
md_offset = bs->dsk.meta_block_size;
|
||||||
next_offset = md_offset;
|
next_offset = md_offset;
|
||||||
|
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
|
||||||
// Read the rest of the metadata
|
// Read the rest of the metadata
|
||||||
resume_2:
|
resume_2:
|
||||||
if (next_offset < bs->dsk.meta_len && submitted == 0)
|
if (next_offset < bs->dsk.meta_len && submitted == 0)
|
||||||
|
@ -179,17 +178,16 @@ resume_2:
|
||||||
if (bufs[i].state == INIT_META_READ_DONE)
|
if (bufs[i].state == INIT_META_READ_DONE)
|
||||||
{
|
{
|
||||||
// Handle result
|
// Handle result
|
||||||
unsigned entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
|
|
||||||
bool changed = false;
|
bool changed = false;
|
||||||
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
|
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
|
||||||
{
|
{
|
||||||
// handle <count> entries
|
// handle <count> entries
|
||||||
changed = changed || handle_entries(
|
changed = changed || handle_meta_block(
|
||||||
bufs[i].buf + sector, entries_per_block,
|
bufs[i].buf + sector, entries_per_block,
|
||||||
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block
|
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (changed && !bs->inmemory_meta)
|
if (changed && !bs->inmemory_meta && !bs->readonly)
|
||||||
{
|
{
|
||||||
// write the modified buffer back
|
// write the modified buffer back
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
|
@ -211,6 +209,43 @@ resume_2:
|
||||||
wait_state = 2;
|
wait_state = 2;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
|
||||||
|
{
|
||||||
|
// we have to zero out additional entries
|
||||||
|
for (i = 0; i < entries_to_zero.size(); )
|
||||||
|
{
|
||||||
|
next_offset = entries_to_zero[i]/entries_per_block;
|
||||||
|
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
|
||||||
|
GET_SQE();
|
||||||
|
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||||
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
|
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||||
|
submitted++;
|
||||||
|
resume_5:
|
||||||
|
if (submitted > 0)
|
||||||
|
{
|
||||||
|
wait_state = 5;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (; i < j; i++)
|
||||||
|
{
|
||||||
|
uint64_t pos = (entries_to_zero[i] % entries_per_block);
|
||||||
|
memset(metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||||
|
}
|
||||||
|
GET_SQE();
|
||||||
|
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||||
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
|
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||||
|
submitted++;
|
||||||
|
resume_6:
|
||||||
|
if (submitted > 0)
|
||||||
|
{
|
||||||
|
wait_state = 6;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
entries_to_zero.clear();
|
||||||
|
}
|
||||||
// metadata read finished
|
// metadata read finished
|
||||||
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
||||||
if (!bs->inmemory_meta)
|
if (!bs->inmemory_meta)
|
||||||
|
@ -236,10 +271,13 @@ resume_2:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt)
|
bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
|
||||||
{
|
{
|
||||||
bool updated = false;
|
bool updated = false;
|
||||||
for (uint64_t i = 0; i < count; i++)
|
uint64_t max_i = entries_per_block;
|
||||||
|
if (max_i > bs->dsk.block_count-done_cnt)
|
||||||
|
max_i = bs->dsk.block_count-done_cnt;
|
||||||
|
for (uint64_t i = 0; i < max_i; i++)
|
||||||
{
|
{
|
||||||
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
|
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
|
||||||
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
|
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
|
||||||
|
@ -255,17 +293,35 @@ bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t
|
||||||
if (clean_it != clean_db.end())
|
if (clean_it != clean_db.end())
|
||||||
{
|
{
|
||||||
// free the previous block
|
// free the previous block
|
||||||
// here we have to zero out the entry because otherwise we'll hit
|
// here we have to zero out the previous entry because otherwise we'll hit
|
||||||
// "tried to overwrite non-zero metadata entry" later
|
// "tried to overwrite non-zero metadata entry" later
|
||||||
updated = true;
|
uint64_t old_clean_loc = clean_it->second.location >> bs->dsk.block_order;
|
||||||
memset(entry, 0, bs->dsk.clean_entry_size);
|
if (bs->inmemory_meta)
|
||||||
|
{
|
||||||
|
uint64_t sector = (old_clean_loc / entries_per_block) * bs->dsk.meta_block_size;
|
||||||
|
uint64_t pos = (old_clean_loc % entries_per_block);
|
||||||
|
clean_disk_entry *old_entry = (clean_disk_entry*)(bs->metadata_buffer + sector + pos*bs->dsk.clean_entry_size);
|
||||||
|
memset(old_entry, 0, bs->dsk.clean_entry_size);
|
||||||
|
}
|
||||||
|
else if (old_clean_loc >= done_cnt)
|
||||||
|
{
|
||||||
|
updated = true;
|
||||||
|
uint64_t sector = ((old_clean_loc - done_cnt) / entries_per_block) * bs->dsk.meta_block_size;
|
||||||
|
uint64_t pos = (old_clean_loc % entries_per_block);
|
||||||
|
clean_disk_entry *old_entry = (clean_disk_entry*)(buf + sector + pos*bs->dsk.clean_entry_size);
|
||||||
|
memset(old_entry, 0, bs->dsk.clean_entry_size);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
|
||||||
|
}
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||||
clean_it->second.location >> bs->dsk.block_order,
|
old_clean_loc,
|
||||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||||
done_cnt+i);
|
done_cnt+i);
|
||||||
#endif
|
#endif
|
||||||
bs->data_alloc->set(clean_it->second.location >> bs->dsk.block_order, false);
|
bs->data_alloc->set(old_clean_loc, false);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -24,7 +24,10 @@ class blockstore_init_meta
|
||||||
uint64_t md_offset = 0;
|
uint64_t md_offset = 0;
|
||||||
uint64_t next_offset = 0;
|
uint64_t next_offset = 0;
|
||||||
uint64_t entries_loaded = 0;
|
uint64_t entries_loaded = 0;
|
||||||
bool handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
unsigned entries_per_block = 0;
|
||||||
|
int i = 0, j = 0;
|
||||||
|
std::vector<uint64_t> entries_to_zero;
|
||||||
|
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||||
void handle_event(ring_data_t *data, int buf_num);
|
void handle_event(ring_data_t *data, int buf_num);
|
||||||
public:
|
public:
|
||||||
blockstore_init_meta(blockstore_impl_t *bs);
|
blockstore_init_meta(blockstore_impl_t *bs);
|
||||||
|
|
Loading…
Reference in New Issue