From 98d584919037b60b42b97f014484e37ae6295633 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 14 Jul 2023 23:32:07 +0300 Subject: [PATCH] Support using Linux page cache for reads --- src/blockstore_disk.cpp | 74 +++++++++++++++++++++++++++++++++++++++ src/blockstore_disk.h | 3 ++ src/blockstore_flush.cpp | 6 ++-- src/blockstore_init.cpp | 10 +++--- src/blockstore_read.cpp | 6 ++-- src/disk_tool_prepare.cpp | 2 ++ src/disk_tool_resize.cpp | 1 + 7 files changed, 91 insertions(+), 11 deletions(-) diff --git a/src/blockstore_disk.cpp b/src/blockstore_disk.cpp index 415b167a..f80bdeeb 100644 --- a/src/blockstore_disk.cpp +++ b/src/blockstore_disk.cpp @@ -45,6 +45,13 @@ void blockstore_disk_t::parse_config(std::map & config meta_block_size = parse_size(config["meta_block_size"]); bitmap_granularity = parse_size(config["bitmap_granularity"]); meta_format = stoull_full(config["meta_format"]); + cached_read_data = config["cached_read_data"] == "true" || config["cached_read_data"] == "yes" || config["cached_read_data"] == "1"; + cached_read_meta = cached_read_data && (meta_device == data_device || meta_device == "") && + config.find("cached_read_meta") == config.end() || + config["cached_read_meta"] == "true" || config["cached_read_meta"] == "yes" || config["cached_read_meta"] == "1"; + cached_read_journal = cached_read_meta && (journal_device == meta_device || journal_device == "") && + config.find("cached_read_journal") == config.end() || + config["cached_read_journal"] == "true" || config["cached_read_journal"] == "yes" || config["cached_read_journal"] == "1"; if (config["data_csum_type"] == "crc32c") { data_csum_type = BLOCKSTORE_CSUM_CRC32C; @@ -288,6 +295,18 @@ void blockstore_disk_t::open_data() { throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno)); } + if (cached_read_data) + { + read_data_fd = open(data_device.c_str(), O_RDWR); + if (read_data_fd == -1) + { + throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno))); + } + } + else + { + read_data_fd = data_fd; + } } void blockstore_disk_t::open_meta() @@ -308,6 +327,18 @@ void blockstore_disk_t::open_meta() { throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno)); } + if (cached_read_meta) + { + read_meta_fd = open(meta_device.c_str(), O_RDWR); + if (read_meta_fd == -1) + { + throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno))); + } + } + else + { + read_meta_fd = meta_fd; + } } else { @@ -326,6 +357,22 @@ void blockstore_disk_t::open_meta() ") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")" ); } + if (!cached_read_meta) + { + read_meta_fd = meta_fd; + } + else if (meta_device == data_device && cached_read_data) + { + read_meta_fd = read_data_fd; + } + else + { + read_meta_fd = open(meta_device.c_str(), O_RDWR); + if (read_meta_fd == -1) + { + throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno))); + } + } } void blockstore_disk_t::open_journal() @@ -360,6 +407,26 @@ void blockstore_disk_t::open_journal() ") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")" ); } + if (!cached_read_journal) + { + read_journal_fd = journal_fd; + } + else if (journal_device == meta_device && cached_read_meta) + { + read_journal_fd = read_meta_fd; + } + else if (journal_device == data_device && cached_read_data) + { + read_journal_fd = read_data_fd; + } + else + { + read_journal_fd = open(journal_device.c_str(), O_RDWR); + if (read_journal_fd == -1) + { + throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno))); + } + } } void blockstore_disk_t::close_all() @@ -370,5 +437,12 @@ void blockstore_disk_t::close_all() close(meta_fd); if (journal_fd >= 0 && journal_fd != meta_fd) close(journal_fd); + if (read_data_fd >= 0 && read_data_fd != data_fd) + close(read_data_fd); + if (read_meta_fd >= 0 && read_meta_fd != meta_fd) + close(read_meta_fd); + if (read_journal_fd >= 0 && read_journal_fd != journal_fd) + close(read_journal_fd); data_fd = meta_fd = journal_fd = -1; + read_data_fd = read_meta_fd = read_journal_fd = -1; } diff --git a/src/blockstore_disk.h b/src/blockstore_disk.h index 5426de0e..402300bc 100644 --- a/src/blockstore_disk.h +++ b/src/blockstore_disk.h @@ -31,8 +31,11 @@ struct blockstore_disk_t uint32_t csum_block_size = 4096; // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking bool disable_flock = false; + // Use linux page cache for reads. If enabled, separate buffered FDs will be opened for reading + bool cached_read_data = false, cached_read_meta = false, cached_read_journal = false; int meta_fd = -1, data_fd = -1, journal_fd = -1; + int read_meta_fd = -1, read_data_fd = -1, read_journal_fd = -1; uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0; uint64_t data_offset, data_device_sect, data_device_size, data_len; uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len; diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 57125eff..f16117c0 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -1087,7 +1087,7 @@ bool journal_flusher_co::read_dirty(int wait_base) data->iov = (struct iovec){ vi.buf, vi.len }; data->callback = simple_callback_r; my_uring_prep_readv( - sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset + sqe, bs->dsk.read_data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset ); wait_count++; bs->find_holes(v, vi.offset, vi.offset+vi.len, [this, buf = (uint8_t*)vi.buf-vi.offset](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end) @@ -1119,7 +1119,7 @@ bool journal_flusher_co::read_dirty(int wait_base) data->iov = (struct iovec){ v[i].buf, (size_t)v[i].len }; data->callback = simple_callback_rj; my_uring_prep_readv( - sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset + sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset ); wait_journal_count++; } @@ -1212,7 +1212,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_ data->callback = simple_callback_r; wr.submitted = true; my_uring_prep_readv( - sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector + sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector ); wait_count++; } diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index be32b588..c8f7a734 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -65,7 +65,7 @@ int blockstore_init_meta::loop() GET_SQE(); data->iov = { metadata_buffer, bs->dsk.meta_block_size }; data->callback = [this](ring_data_t *data) { handle_event(data, -1); }; - my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset); + my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset); bs->ringloop->submit(); submitted++; resume_1: @@ -202,7 +202,7 @@ resume_2: data->iov = { bufs[i].buf, bufs[i].size }; data->callback = [this, i](ring_data_t *data) { handle_event(data, i); }; if (!zero_on_init) - my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset); + my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset); else { // Fill metadata with zeroes @@ -259,7 +259,7 @@ resume_2: GET_SQE(); data->iov = { metadata_buffer, bs->dsk.meta_block_size }; data->callback = [this](ring_data_t *data) { handle_event(data, -1); }; - my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size); + my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size); submitted++; resume_5: if (submitted > 0) @@ -467,7 +467,7 @@ int blockstore_init_journal::loop() data = ((ring_data_t*)sqe->user_data); data->iov = { submitted_buf, bs->journal.block_size }; data->callback = simple_callback; - my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); + my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset); bs->ringloop->submit(); wait_count = 1; resume_1: @@ -607,7 +607,7 @@ resume_1: end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE, }; data->callback = [this](ring_data_t *data1) { handle_event(data1); }; - my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos); + my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + journal_pos); bs->ringloop->submit(); } while (done.size() > 0) diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index 73c11465..62179e7f 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -29,7 +29,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_ PRIV(op)->pending_ops++; my_uring_prep_readv( sqe, - IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd, + IS_JOURNAL(item_state) ? dsk.read_journal_fd : dsk.read_data_fd, &data->iov, 1, (IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset ); @@ -348,7 +348,7 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin .csum_buf = vi->csum_buf, .dyn_data = vi->dyn_data, }; - int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd); + int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.read_journal_fd : dsk.read_data_fd); uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset); uint32_t d_pos = 0; for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX) @@ -702,7 +702,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t BS_SUBMIT_GET_SQE(sqe, data); data->iov = (struct iovec){ buf, dsk.meta_block_size }; PRIV(op)->pending_ops++; - my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector); + my_uring_prep_readv(sqe, dsk.read_meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector); data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); }; // return pointer to checksums + bitmap return buf + pos + sizeof(clean_disk_entry); diff --git a/src/disk_tool_prepare.cpp b/src/disk_tool_prepare.cpp index 54d05a05..3fe18818 100644 --- a/src/disk_tool_prepare.cpp +++ b/src/disk_tool_prepare.cpp @@ -116,6 +116,7 @@ int disk_tool_t::prepare_one(std::map options, int is_ try { dsk.parse_config(options); + dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false; dsk.open_data(); dsk.open_meta(); dsk.open_journal(); @@ -479,6 +480,7 @@ int disk_tool_t::get_meta_partition(std::vector & ssds, std { blockstore_disk_t dsk; dsk.parse_config(options); + dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false; dsk.open_data(); dsk.open_meta(); dsk.open_journal(); diff --git a/src/disk_tool_resize.cpp b/src/disk_tool_resize.cpp index d0b32060..1c567718 100644 --- a/src/disk_tool_resize.cpp +++ b/src/disk_tool_resize.cpp @@ -91,6 +91,7 @@ int disk_tool_t::resize_parse_params() try { dsk.parse_config(options); + dsk.cached_read_data = dsk.cached_read_meta = dsk.cached_read_journal = false; dsk.open_data(); dsk.open_meta(); dsk.open_journal();