Support using Linux page cache for reads
parent
e4ea8a9514
commit
98d5849190
|
@ -45,6 +45,13 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||||
meta_block_size = parse_size(config["meta_block_size"]);
|
meta_block_size = parse_size(config["meta_block_size"]);
|
||||||
bitmap_granularity = parse_size(config["bitmap_granularity"]);
|
bitmap_granularity = parse_size(config["bitmap_granularity"]);
|
||||||
meta_format = stoull_full(config["meta_format"]);
|
meta_format = stoull_full(config["meta_format"]);
|
||||||
|
cached_read_data = config["cached_read_data"] == "true" || config["cached_read_data"] == "yes" || config["cached_read_data"] == "1";
|
||||||
|
cached_read_meta = cached_read_data && (meta_device == data_device || meta_device == "") &&
|
||||||
|
config.find("cached_read_meta") == config.end() ||
|
||||||
|
config["cached_read_meta"] == "true" || config["cached_read_meta"] == "yes" || config["cached_read_meta"] == "1";
|
||||||
|
cached_read_journal = cached_read_meta && (journal_device == meta_device || journal_device == "") &&
|
||||||
|
config.find("cached_read_journal") == config.end() ||
|
||||||
|
config["cached_read_journal"] == "true" || config["cached_read_journal"] == "yes" || config["cached_read_journal"] == "1";
|
||||||
if (config["data_csum_type"] == "crc32c")
|
if (config["data_csum_type"] == "crc32c")
|
||||||
{
|
{
|
||||||
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
|
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
|
||||||
|
@ -288,6 +295,18 @@ void blockstore_disk_t::open_data()
|
||||||
{
|
{
|
||||||
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
|
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
|
||||||
}
|
}
|
||||||
|
if (cached_read_data)
|
||||||
|
{
|
||||||
|
read_data_fd = open(data_device.c_str(), O_RDWR);
|
||||||
|
if (read_data_fd == -1)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_data_fd = data_fd;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_disk_t::open_meta()
|
void blockstore_disk_t::open_meta()
|
||||||
|
@ -308,6 +327,18 @@ void blockstore_disk_t::open_meta()
|
||||||
{
|
{
|
||||||
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
|
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
|
||||||
}
|
}
|
||||||
|
if (cached_read_meta)
|
||||||
|
{
|
||||||
|
read_meta_fd = open(meta_device.c_str(), O_RDWR);
|
||||||
|
if (read_meta_fd == -1)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_meta_fd = meta_fd;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -326,6 +357,22 @@ void blockstore_disk_t::open_meta()
|
||||||
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
|
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (!cached_read_meta)
|
||||||
|
{
|
||||||
|
read_meta_fd = meta_fd;
|
||||||
|
}
|
||||||
|
else if (meta_device == data_device && cached_read_data)
|
||||||
|
{
|
||||||
|
read_meta_fd = read_data_fd;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_meta_fd = open(meta_device.c_str(), O_RDWR);
|
||||||
|
if (read_meta_fd == -1)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_disk_t::open_journal()
|
void blockstore_disk_t::open_journal()
|
||||||
|
@ -360,6 +407,26 @@ void blockstore_disk_t::open_journal()
|
||||||
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
|
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (!cached_read_journal)
|
||||||
|
{
|
||||||
|
read_journal_fd = journal_fd;
|
||||||
|
}
|
||||||
|
else if (journal_device == meta_device && cached_read_meta)
|
||||||
|
{
|
||||||
|
read_journal_fd = read_meta_fd;
|
||||||
|
}
|
||||||
|
else if (journal_device == data_device && cached_read_data)
|
||||||
|
{
|
||||||
|
read_journal_fd = read_data_fd;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_journal_fd = open(journal_device.c_str(), O_RDWR);
|
||||||
|
if (read_journal_fd == -1)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_disk_t::close_all()
|
void blockstore_disk_t::close_all()
|
||||||
|
@ -370,5 +437,12 @@ void blockstore_disk_t::close_all()
|
||||||
close(meta_fd);
|
close(meta_fd);
|
||||||
if (journal_fd >= 0 && journal_fd != meta_fd)
|
if (journal_fd >= 0 && journal_fd != meta_fd)
|
||||||
close(journal_fd);
|
close(journal_fd);
|
||||||
|
if (read_data_fd >= 0 && read_data_fd != data_fd)
|
||||||
|
close(read_data_fd);
|
||||||
|
if (read_meta_fd >= 0 && read_meta_fd != meta_fd)
|
||||||
|
close(read_meta_fd);
|
||||||
|
if (read_journal_fd >= 0 && read_journal_fd != journal_fd)
|
||||||
|
close(read_journal_fd);
|
||||||
data_fd = meta_fd = journal_fd = -1;
|
data_fd = meta_fd = journal_fd = -1;
|
||||||
|
read_data_fd = read_meta_fd = read_journal_fd = -1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,8 +31,11 @@ struct blockstore_disk_t
|
||||||
uint32_t csum_block_size = 4096;
|
uint32_t csum_block_size = 4096;
|
||||||
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
|
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
|
||||||
bool disable_flock = false;
|
bool disable_flock = false;
|
||||||
|
// Use the Linux page cache for reads. If enabled, separate buffered FDs are opened for reading
|
||||||
|
bool cached_read_data = false, cached_read_meta = false, cached_read_journal = false;
|
||||||
|
|
||||||
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
||||||
|
int read_meta_fd = -1, read_data_fd = -1, read_journal_fd = -1;
|
||||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
||||||
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
||||||
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
||||||
|
|
|
@ -1087,7 +1087,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
|
||||||
data->iov = (struct iovec){ vi.buf, vi.len };
|
data->iov = (struct iovec){ vi.buf, vi.len };
|
||||||
data->callback = simple_callback_r;
|
data->callback = simple_callback_r;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
|
sqe, bs->dsk.read_data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
bs->find_holes(v, vi.offset, vi.offset+vi.len, [this, buf = (uint8_t*)vi.buf-vi.offset](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
|
bs->find_holes(v, vi.offset, vi.offset+vi.len, [this, buf = (uint8_t*)vi.buf-vi.offset](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
|
||||||
|
@ -1119,7 +1119,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
|
||||||
data->iov = (struct iovec){ v[i].buf, (size_t)v[i].len };
|
data->iov = (struct iovec){ v[i].buf, (size_t)v[i].len };
|
||||||
data->callback = simple_callback_rj;
|
data->callback = simple_callback_rj;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
|
sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
|
||||||
);
|
);
|
||||||
wait_journal_count++;
|
wait_journal_count++;
|
||||||
}
|
}
|
||||||
|
@ -1212,7 +1212,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
|
||||||
data->callback = simple_callback_r;
|
data->callback = simple_callback_r;
|
||||||
wr.submitted = true;
|
wr.submitted = true;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
|
sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
}
|
}
|
||||||
|
|
|
@ -65,7 +65,7 @@ int blockstore_init_meta::loop()
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
submitted++;
|
submitted++;
|
||||||
resume_1:
|
resume_1:
|
||||||
|
@ -202,7 +202,7 @@ resume_2:
|
||||||
data->iov = { bufs[i].buf, bufs[i].size };
|
data->iov = { bufs[i].buf, bufs[i].size };
|
||||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||||
if (!zero_on_init)
|
if (!zero_on_init)
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Fill metadata with zeroes
|
// Fill metadata with zeroes
|
||||||
|
@ -259,7 +259,7 @@ resume_2:
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||||
submitted++;
|
submitted++;
|
||||||
resume_5:
|
resume_5:
|
||||||
if (submitted > 0)
|
if (submitted > 0)
|
||||||
|
@ -467,7 +467,7 @@ int blockstore_init_journal::loop()
|
||||||
data = ((ring_data_t*)sqe->user_data);
|
data = ((ring_data_t*)sqe->user_data);
|
||||||
data->iov = { submitted_buf, bs->journal.block_size };
|
data->iov = { submitted_buf, bs->journal.block_size };
|
||||||
data->callback = simple_callback;
|
data->callback = simple_callback;
|
||||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
wait_count = 1;
|
wait_count = 1;
|
||||||
resume_1:
|
resume_1:
|
||||||
|
@ -607,7 +607,7 @@ resume_1:
|
||||||
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
||||||
};
|
};
|
||||||
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
}
|
}
|
||||||
while (done.size() > 0)
|
while (done.size() > 0)
|
||||||
|
|
|
@ -29,7 +29,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
|
||||||
PRIV(op)->pending_ops++;
|
PRIV(op)->pending_ops++;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe,
|
sqe,
|
||||||
IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd,
|
IS_JOURNAL(item_state) ? dsk.read_journal_fd : dsk.read_data_fd,
|
||||||
&data->iov, 1,
|
&data->iov, 1,
|
||||||
(IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
|
(IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
|
||||||
);
|
);
|
||||||
|
@ -348,7 +348,7 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin
|
||||||
.csum_buf = vi->csum_buf,
|
.csum_buf = vi->csum_buf,
|
||||||
.dyn_data = vi->dyn_data,
|
.dyn_data = vi->dyn_data,
|
||||||
};
|
};
|
||||||
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
|
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.read_journal_fd : dsk.read_data_fd);
|
||||||
uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
|
uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
|
||||||
uint32_t d_pos = 0;
|
uint32_t d_pos = 0;
|
||||||
for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
|
for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
|
||||||
|
@ -702,7 +702,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
|
||||||
BS_SUBMIT_GET_SQE(sqe, data);
|
BS_SUBMIT_GET_SQE(sqe, data);
|
||||||
data->iov = (struct iovec){ buf, dsk.meta_block_size };
|
data->iov = (struct iovec){ buf, dsk.meta_block_size };
|
||||||
PRIV(op)->pending_ops++;
|
PRIV(op)->pending_ops++;
|
||||||
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
my_uring_prep_readv(sqe, dsk.read_meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
||||||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||||
// return pointer to checksums + bitmap
|
// return pointer to checksums + bitmap
|
||||||
return buf + pos + sizeof(clean_disk_entry);
|
return buf + pos + sizeof(clean_disk_entry);
|
||||||
|
|
|
@ -116,6 +116,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
dsk.parse_config(options);
|
dsk.parse_config(options);
|
||||||
|
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
|
||||||
dsk.open_data();
|
dsk.open_data();
|
||||||
dsk.open_meta();
|
dsk.open_meta();
|
||||||
dsk.open_journal();
|
dsk.open_journal();
|
||||||
|
@ -479,6 +480,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
|
||||||
{
|
{
|
||||||
blockstore_disk_t dsk;
|
blockstore_disk_t dsk;
|
||||||
dsk.parse_config(options);
|
dsk.parse_config(options);
|
||||||
|
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
|
||||||
dsk.open_data();
|
dsk.open_data();
|
||||||
dsk.open_meta();
|
dsk.open_meta();
|
||||||
dsk.open_journal();
|
dsk.open_journal();
|
||||||
|
|
|
@ -91,6 +91,7 @@ int disk_tool_t::resize_parse_params()
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
dsk.parse_config(options);
|
dsk.parse_config(options);
|
||||||
|
dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false;
|
||||||
dsk.open_data();
|
dsk.open_data();
|
||||||
dsk.open_meta();
|
dsk.open_meta();
|
||||||
dsk.open_journal();
|
dsk.open_journal();
|
||||||
|
|
Loading…
Reference in New Issue