2020-09-17 23:02:40 +03:00
|
|
|
// Copyright (c) Vitaliy Filippov, 2019+
|
2021-02-06 01:26:07 +03:00
|
|
|
// License: VNPL-1.1 (see README.md for details)
|
2020-09-17 23:02:40 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
#include "blockstore_impl.h"
|
2019-11-11 00:28:14 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
{
    // Snapshot the live journal cursor so check_available() can simulate
    // appending a batch without mutating the real journal state
    this->bs = bs;
    first_sector = -1;
    sectors_to_write = 0;
    next_sector = bs->journal.cur_sector;
    next_in_pos = bs->journal.in_sector_pos;
    next_pos = bs->journal.next_free;
    // "right_dir" means we are still writing in the area at/after used_start,
    // i.e. we have not wrapped around the end of the journal yet
    right_dir = next_pos >= bs->journal.used_start;
}
|
|
|
|
|
|
|
|
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal.
// Returns 1 if the batch fits. Returns 0 and sets PRIV(op)->wait_for when the operation has to wait
// (either for a free in-memory sector buffer or for the journal to be trimmed).
// Throws std::runtime_error if the configured sector buffer count can never hold the batch.
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
{
    // The loop simulates filling journal sectors using the cursor captured in the
    // constructor (next_pos / next_sector / next_in_pos); the live journal is not modified
    int required = entries_required;
    while (1)
    {
        // With no_same_sector_overwrites, a sector image that was already written to disk
        // must not receive more entries - force 0 so we advance to the next sector.
        // NOTE: assumes size > 0 and size <= block_size (division below) - callers guarantee this
        int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
            ? 0
            : (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
            if (fits > required)
            {
                fits = required;
            }
            if (first_sector == -1)
            {
                // Remember the first sector we touch to detect a full buffer wraparound below
                first_sector = next_sector;
            }
            required -= fits;
            next_in_pos += fits * size;
            // This sector will receive new entries and thus has to be submitted to disk
            sectors_to_write++;
        }
        else if (bs->journal.sector_info[next_sector].dirty)
        {
            // Sector is full but still holds unsubmitted entries - it must be written out, too
            sectors_to_write++;
        }
        if (required <= 0)
        {
            break;
        }
        // Advance to the next journal block; on wraparound we restart at block_size,
        // not 0 - the first journal block is never reused by this loop
        next_pos = next_pos + bs->journal.block_size;
        if (next_pos >= bs->journal.len)
        {
            next_pos = bs->journal.block_size;
            right_dir = false;
        }
        next_in_pos = 0;
        next_sector = ((next_sector + 1) % bs->journal.sector_count);
        if (next_sector == first_sector)
        {
            // next_sector may wrap when all sectors are flushed and the incoming batch is too big
            // This is an error condition, we can't wait for anything in this case
            throw std::runtime_error(
                "Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
            );
        }
        // A sector buffer is busy if its disk write is still in flight (flush_count > 0)
        // or if it still holds entries that were never submitted (dirty)
        if (bs->journal.sector_info[next_sector].flush_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
            // Count busy buffers only for the diagnostic message below
            int used = 0, dirty = 0;
            for (int i = 0; i < bs->journal.sector_count; i++)
            {
                if (bs->journal.sector_info[i].dirty)
                {
                    dirty++;
                    used++;
                }
                if (bs->journal.sector_info[i].flush_count > 0)
                {
                    used++;
                }
            }
            // In fact, it's even more rare than "ran out of journal space", so print a warning
            printf(
                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
                " is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
                used, bs->journal.sector_count, dirty, next_sector,
                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
                bs->journal.sector_info[next_sector].flush_count
            );
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
        }
    }
    if (data_after > 0)
    {
        // Account for the raw data blob that follows the entries themselves
        next_pos = next_pos + data_after;
        if (next_pos > bs->journal.len)
        {
            // Data doesn't fit at the tail - it would be placed after the wrap point instead
            next_pos = bs->journal.block_size + data_after;
            right_dir = false;
        }
    }
    // After wrapping, the write position must keep at least one block of distance
    // before used_start, otherwise we'd catch up with still-referenced entries
    if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
    {
        // No space in the journal. Wait until used_start changes.
        printf(
            "Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
        // Ask the flusher to trim the journal so used_start can advance
        bs->flusher->request_trim();
        PRIV(op)->wait_detail = bs->journal.used_start;
        return 0;
    }
    return 1;
}
|
2019-11-11 18:24:04 +03:00
|
|
|
|
2019-11-19 18:07:40 +03:00
|
|
|
// Allocate space for one journal entry of <size> bytes in the current journal sector,
// moving to the next sector if the entry doesn't fit, and pre-fill the common header
// fields (magic, type, size, crc32_prev). Marks the sector dirty (= holds unsubmitted
// entries). Returns a pointer into the in-memory sector image for the caller to fill in.
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{
    if (!journal.entry_fits(size))
    {
        // The sector being left behind must already be submitted -
        // dirty is cleared in prepare_journal_sector_write()
        assert(!journal.sector_info[journal.cur_sector].dirty);
        // Move to the next journal sector
        if (journal.sector_info[journal.cur_sector].flush_count > 0)
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
            // Caller must have ensured buffer availability (via check_available)
            assert(!journal.sector_info[journal.cur_sector].flush_count);
        }
        else
        {
            // Buffer is reused in place; presumably dirty_start tracks the start
            // of the not-yet-flushed journal region - confirm against flusher code
            journal.dirty_start = journal.next_free;
        }
        // Fresh sector image: not yet written to disk, mapped at the new journal offset
        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        // Advance next_free by one block, wrapping to block_size (block 0 is skipped)
        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        // Zero the new sector image: in journal.buffer when the journal is kept
        // in memory, otherwise in the per-sector staging buffer
        memset(journal.inmemory
            ? journal.buffer + journal.sector_info[journal.cur_sector].offset
            : journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
    }
    // Entry lives at the current fill position inside the sector image
    journal_entry *je = (struct journal_entry*)(
        (journal.inmemory
            ? journal.buffer + journal.sector_info[journal.cur_sector].offset
            : journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
    );
    journal.in_sector_pos += size;
    // Common entry header; crc32_prev chains entries for journal replay
    je->magic = JOURNAL_MAGIC;
    je->type = type;
    je->size = size;
    je->crc32_prev = journal.crc32_last;
    // The sector now holds entries that were not yet submitted to disk
    journal.sector_info[journal.cur_sector].dirty = true;
    return je;
}
|
|
|
|
|
2021-12-15 02:43:12 +03:00
|
|
|
// Queue an io_uring write of journal sector <cur_sector> and make <op> wait for its
// completion. If the sector was already submitted within the current batch
// (submit_id != 0), no second write is queued - the op just piggybacks on the
// in-flight write via flushing_ops. Caller must guarantee an SQE is available.
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
{
    // Don't submit the same sector twice in the same batch
    if (!journal.sector_info[cur_sector].submit_id)
    {
        io_uring_sqe *sqe = get_sqe();
        // Caller must ensure availability of an SQE
        assert(sqe != NULL);
        ring_data_t *data = ((ring_data_t*)sqe->user_data);
        journal.sector_info[cur_sector].written = true;
        // Tag the sector with a new monotonically increasing submission id
        journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
        journal.submitting_sectors.push_back(cur_sector);
        // flush_count = number of in-flight disk writes for this sector buffer;
        // decremented in handle_journal_write()
        journal.sector_info[cur_sector].flush_count++;
        // Write from the in-memory journal copy or from the per-sector staging buffer
        data->iov = (struct iovec){
            (journal.inmemory
                ? journal.buffer + journal.sector_info[cur_sector].offset
                : journal.sector_buf + journal.block_size*cur_sector),
            journal.block_size
        };
        // Capture the id by value so the completion maps back to this exact write
        data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
        my_uring_prep_writev(
            sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
        );
    }
    // All entries in this sector image are now (being) submitted
    journal.sector_info[cur_sector].dirty = false;
    // But always remember that this operation has to wait until this exact journal write is finished
    journal.flushing_ops.insert((pending_journaling_t){
        .flush_id = journal.sector_info[cur_sector].submit_id,
        .sector = cur_sector,
        .op = op,
    });
    auto priv = PRIV(op);
    priv->pending_ops++;
    // Track the 1-based range of sectors this op is waiting on
    // (0 means "none", hence the +1 offset)
    if (!priv->min_flushed_journal_sector)
        priv->min_flushed_journal_sector = 1+cur_sector;
    priv->max_flushed_journal_sector = 1+cur_sector;
}
|
|
|
|
|
|
|
|
// io_uring completion handler for one journal sector write identified by <flush_id>.
// Decrements the sector's in-flight counter and resumes every operation that was
// waiting on this exact write.
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
    live = true;
    if (data->res != data->iov.iov_len)
    {
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "journal write failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
        );
    }
    // Position at the first flushing_ops entry with this flush_id.
    // NOTE(review): relies on pending_journaling_t's ordering making a probe with
    // only .flush_id set compare before all real entries of the same id - confirm
    // against the comparator declared in the header
    auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
    if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
    {
        // All entries sharing flush_id refer to the same single sector write,
        // so the sector's in-flight counter is decremented exactly once
        journal.sector_info[fl_it->sector].flush_count--;
    }
    while (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
    {
        auto priv = PRIV(fl_it->op);
        priv->pending_ops--;
        assert(priv->pending_ops >= 0);
        if (priv->pending_ops == 0)
        {
            // Last pending write for this op: free its sector references,
            // advance its state machine and wake the event loop
            release_journal_sectors(fl_it->op);
            priv->op_state++;
            ringloop->wakeup();
        }
        // Post-increment keeps the iterator valid across the erase
        journal.flushing_ops.erase(fl_it++);
    }
}
|
2019-11-28 14:41:03 +03:00
|
|
|
|
|
|
|
journal_t::~journal_t()
{
    // Release every journal allocation and null the pointers so a stale
    // reference to this object can't lead to a double free
    if (buffer)
        free(buffer);
    if (sector_info)
        free(sector_info);
    if (sector_buf)
        free(sector_buf);
    buffer = NULL;
    sector_info = NULL;
    sector_buf = NULL;
}
|
2019-11-29 02:13:30 +03:00
|
|
|
|
2020-10-24 00:27:03 +03:00
|
|
|
uint64_t journal_t::get_trim_pos()
|
2019-11-29 02:13:30 +03:00
|
|
|
{
|
|
|
|
auto journal_used_it = used_sectors.lower_bound(used_start);
|
|
|
|
#ifdef BLOCKSTORE_DEBUG
|
|
|
|
printf(
|
2020-06-01 01:55:54 +03:00
|
|
|
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
|
|
|
used_start, next_free, dirty_start,
|
2019-11-29 02:13:30 +03:00
|
|
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
|
|
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
|
|
|
);
|
|
|
|
#endif
|
|
|
|
if (journal_used_it == used_sectors.end())
|
|
|
|
{
|
|
|
|
// Journal is cleared to its end, restart from the beginning
|
|
|
|
journal_used_it = used_sectors.begin();
|
|
|
|
if (journal_used_it == used_sectors.end())
|
|
|
|
{
|
|
|
|
// Journal is empty
|
2020-10-24 00:27:03 +03:00
|
|
|
return next_free;
|
2019-11-29 02:13:30 +03:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-10-24 00:27:03 +03:00
|
|
|
// next_free does not need updating during trim
|
|
|
|
return journal_used_it->first;
|
2019-11-29 02:13:30 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (journal_used_it->first > used_start)
|
|
|
|
{
|
|
|
|
// Journal is cleared up to <journal_used_it>
|
2020-10-24 00:27:03 +03:00
|
|
|
return journal_used_it->first;
|
2019-11-29 02:13:30 +03:00
|
|
|
}
|
2020-10-24 00:27:03 +03:00
|
|
|
// Can't trim journal
|
|
|
|
return used_start;
|
2019-11-29 02:13:30 +03:00
|
|
|
}
|
2021-07-17 16:13:41 +03:00
|
|
|
|
|
|
|
void journal_t::dump_diagnostics()
|
|
|
|
{
|
|
|
|
auto journal_used_it = used_sectors.lower_bound(used_start);
|
|
|
|
if (journal_used_it == used_sectors.end())
|
|
|
|
{
|
|
|
|
// Journal is cleared to its end, restart from the beginning
|
|
|
|
journal_used_it = used_sectors.begin();
|
|
|
|
}
|
|
|
|
printf(
|
|
|
|
"Journal: used_start=%08lx next_free=%08lx dirty_start=%08lx trim_to=%08lx trim_to_refs=%ld\n",
|
|
|
|
used_start, next_free, dirty_start,
|
|
|
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
|
|
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
|
|
|
);
|
|
|
|
}
|