Support using Linux page cache for reads

cached-reads
Vitaliy Filippov 2023-07-14 23:32:07 +03:00
parent e4ea8a9514
commit 98d5849190
7 changed files with 91 additions and 11 deletions

View File

@ -45,6 +45,13 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
meta_block_size = parse_size(config["meta_block_size"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
cached_read_data = config["cached_read_data"] == "true" || config["cached_read_data"] == "yes" || config["cached_read_data"] == "1";
cached_read_meta = cached_read_data && (meta_device == data_device || meta_device == "") &&
config.find("cached_read_meta") == config.end() ||
config["cached_read_meta"] == "true" || config["cached_read_meta"] == "yes" || config["cached_read_meta"] == "1";
cached_read_journal = cached_read_meta && (journal_device == meta_device || journal_device == "") &&
config.find("cached_read_journal") == config.end() ||
config["cached_read_journal"] == "true" || config["cached_read_journal"] == "yes" || config["cached_read_journal"] == "1";
if (config["data_csum_type"] == "crc32c")
{
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
@ -288,6 +295,18 @@ void blockstore_disk_t::open_data()
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
if (cached_read_data)
{
read_data_fd = open(data_device.c_str(), O_RDWR);
if (read_data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
}
}
else
{
read_data_fd = data_fd;
}
}
void blockstore_disk_t::open_meta()
@ -308,6 +327,18 @@ void blockstore_disk_t::open_meta()
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
if (cached_read_meta)
{
read_meta_fd = open(meta_device.c_str(), O_RDWR);
if (read_meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
}
}
else
{
read_meta_fd = meta_fd;
}
}
else
{
@ -326,6 +357,22 @@ void blockstore_disk_t::open_meta()
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
);
}
if (!cached_read_meta)
{
read_meta_fd = meta_fd;
}
else if (meta_device == data_device && cached_read_data)
{
read_meta_fd = read_data_fd;
}
else
{
read_meta_fd = open(meta_device.c_str(), O_RDWR);
if (read_meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
}
}
}
void blockstore_disk_t::open_journal()
@ -360,6 +407,26 @@ void blockstore_disk_t::open_journal()
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
);
}
if (!cached_read_journal)
{
read_journal_fd = journal_fd;
}
else if (journal_device == meta_device && cached_read_meta)
{
read_journal_fd = read_meta_fd;
}
else if (journal_device == data_device && cached_read_data)
{
read_journal_fd = read_data_fd;
}
else
{
read_journal_fd = open(journal_device.c_str(), O_RDWR);
if (read_journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
}
}
}
void blockstore_disk_t::close_all()
@ -370,5 +437,12 @@ void blockstore_disk_t::close_all()
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
if (read_data_fd >= 0 && read_data_fd != data_fd)
close(read_data_fd);
if (read_meta_fd >= 0 && read_meta_fd != meta_fd)
close(read_meta_fd);
if (read_journal_fd >= 0 && read_journal_fd != journal_fd)
close(read_journal_fd);
data_fd = meta_fd = journal_fd = -1;
read_data_fd = read_meta_fd = read_journal_fd = -1;
}

View File

@ -31,8 +31,11 @@ struct blockstore_disk_t
uint32_t csum_block_size = 4096;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// Use linux page cache for reads. If enabled, separate buffered FDs will be opened for reading
bool cached_read_data = false, cached_read_meta = false, cached_read_journal = false;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
int read_meta_fd = -1, read_data_fd = -1, read_journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;

View File

@ -1087,7 +1087,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
data->iov = (struct iovec){ vi.buf, vi.len };
data->callback = simple_callback_r;
my_uring_prep_readv(
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
sqe, bs->dsk.read_data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
);
wait_count++;
bs->find_holes(v, vi.offset, vi.offset+vi.len, [this, buf = (uint8_t*)vi.buf-vi.offset](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
@ -1119,7 +1119,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
data->iov = (struct iovec){ v[i].buf, (size_t)v[i].len };
data->callback = simple_callback_rj;
my_uring_prep_readv(
sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
);
wait_journal_count++;
}
@ -1212,7 +1212,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
data->callback = simple_callback_r;
wr.submitted = true;
my_uring_prep_readv(
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
);
wait_count++;
}

View File

@ -65,7 +65,7 @@ int blockstore_init_meta::loop()
GET_SQE();
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset);
bs->ringloop->submit();
submitted++;
resume_1:
@ -202,7 +202,7 @@ resume_2:
data->iov = { bufs[i].buf, bufs[i].size };
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
if (!zero_on_init)
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
else
{
// Fill metadata with zeroes
@ -259,7 +259,7 @@ resume_2:
GET_SQE();
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
submitted++;
resume_5:
if (submitted > 0)
@ -467,7 +467,7 @@ int blockstore_init_journal::loop()
data = ((ring_data_t*)sqe->user_data);
data->iov = { submitted_buf, bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset);
bs->ringloop->submit();
wait_count = 1;
resume_1:
@ -607,7 +607,7 @@ resume_1:
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
};
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
bs->ringloop->submit();
}
while (done.size() > 0)

View File

@ -29,7 +29,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
PRIV(op)->pending_ops++;
my_uring_prep_readv(
sqe,
IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd,
IS_JOURNAL(item_state) ? dsk.read_journal_fd : dsk.read_data_fd,
&data->iov, 1,
(IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
);
@ -348,7 +348,7 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin
.csum_buf = vi->csum_buf,
.dyn_data = vi->dyn_data,
};
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.read_journal_fd : dsk.read_data_fd);
uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
uint32_t d_pos = 0;
for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
@ -702,7 +702,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
BS_SUBMIT_GET_SQE(sqe, data);
data->iov = (struct iovec){ buf, dsk.meta_block_size };
PRIV(op)->pending_ops++;
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
my_uring_prep_readv(sqe, dsk.read_meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
// return pointer to checksums + bitmap
return buf + pos + sizeof(clean_disk_entry);

View File

@ -116,6 +116,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try
{
dsk.parse_config(options);
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@ -479,6 +480,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{
blockstore_disk_t dsk;
dsk.parse_config(options);
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();

View File

@ -91,6 +91,7 @@ int disk_tool_t::resize_parse_params()
try
{
dsk.parse_config(options);
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();