diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 579ea7f8..0e225922 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,7 +64,7 @@ include_directories(
 
 # libvitastor_blk.so
 add_library(vitastor_blk SHARED
-    allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
+    allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
     blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp crc32c.c ringloop.cpp
 )
 target_link_libraries(vitastor_blk
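The new translation unit pulls every device-level concern (option parsing, geometry validation, opening and locking the three devices, and length calculation) out of blockstore_impl_t into a standalone blockstore_disk_t. A minimal sketch of the intended call order, not part of the patch itself; the device path and block size are illustrative values only:

    // Hypothetical standalone use of the new module; in the patch the same
    // sequence is driven by blockstore_impl_t's constructor further down.
    #include <stdexcept>
    #include "blockstore_disk.h"

    int main()
    {
        std::map<std::string, std::string> cfg;
        cfg["data_device"] = "/dev/sdX"; // hypothetical device path
        cfg["block_size"] = "131072";    // 128 KiB, the assumed default
        blockstore_disk_t dsk;
        try
        {
            dsk.parse_config(cfg); // validate options, derive entry sizes
            dsk.open_data();       // open + flock the data device
            dsk.open_meta();       // shares data_fd when no separate device is set
            dsk.open_journal();    // shares meta_fd when no separate device is set
            dsk.calc_lengths();    // compute data_len, meta_len, journal_len
        }
        catch (std::exception & e)
        {
            dsk.close_all();       // closes each distinct fd exactly once
            throw;
        }
        dsk.close_all();
        return 0;
    }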
diff --git a/src/blockstore_disk.cpp b/src/blockstore_disk.cpp
new file mode 100644
index 00000000..422f2626
--- /dev/null
+++ b/src/blockstore_disk.cpp
@@ -0,0 +1,322 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include <sys/file.h>
+
+#include <stdexcept>
+
+#include "blockstore_impl.h"
+#include "blockstore_disk.h"
+
+static uint32_t is_power_of_two(uint64_t value)
+{
+    uint32_t l = 0;
+    while (value > 1)
+    {
+        if (value & 1)
+        {
+            return 64;
+        }
+        value = value >> 1;
+        l++;
+    }
+    return l;
+}
+
+void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config)
+{
+    // Parse
+    if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
+    {
+        disable_flock = true;
+    }
+    cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
+    data_device = config["data_device"];
+    data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
+    cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
+    meta_device = config["meta_device"];
+    meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
+    data_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
+    journal_device = config["journal_device"];
+    journal_offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
+    disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
+    journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
+    meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
+    bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
+    // Validate
+    if (!data_block_size)
+    {
+        data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
+    }
+    if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
+    {
+        throw std::runtime_error("Bad block size");
+    }
+    if (!disk_alignment)
+    {
+        disk_alignment = 4096;
+    }
+    else if (disk_alignment % DIRECT_IO_ALIGNMENT)
+    {
+        throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
+    }
+    if (!journal_block_size)
+    {
+        journal_block_size = 4096;
+    }
+    else if (journal_block_size % DIRECT_IO_ALIGNMENT)
+    {
+        throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
+    }
+    if (!meta_block_size)
+    {
+        meta_block_size = 4096;
+    }
+    else if (meta_block_size % DIRECT_IO_ALIGNMENT)
+    {
+        throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
+    }
+    if (data_offset % disk_alignment)
+    {
+        throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
+    }
+    if (!bitmap_granularity)
+    {
+        bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
+    }
+    else if (bitmap_granularity % disk_alignment)
+    {
+        throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
+    }
+    if (data_block_size % bitmap_granularity)
+    {
+        throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
+    }
+    if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
+    {
+        journal_device = "";
+    }
+    if (meta_device == data_device)
+    {
+        meta_device = "";
+    }
+    if (meta_offset % meta_block_size)
+    {
+        throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
+    }
+    if (journal_offset % journal_block_size)
+    {
+        throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
+    }
+    clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
+    clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
+}
+
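parse_config() ends by fixing the on-disk metadata entry size. A worked instance of those two derived values, assuming the default 128 KiB data block (1 << DEFAULT_DATA_BLOCK_ORDER), the default 4 KiB bitmap_granularity, and a 24-byte clean_disk_entry header (the header size is an illustrative assumption, not taken from the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        uint64_t data_block_size = 131072, bitmap_granularity = 4096;
        // one bit per 4 KiB chunk of a 128 KiB block:
        uint32_t clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8; // = 4 bytes
        // header plus two bitmaps; the flusher below calls the second one the
        // "external bitmap/attributes", hence the factor of 2:
        uint32_t clean_entry_size = 24 + 2*clean_entry_bitmap_size; // = 32 bytes
        printf("bitmap: %u bytes, entry: %u bytes\n", clean_entry_bitmap_size, clean_entry_size);
        return 0;
    }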
+void blockstore_disk_t::calc_lengths()
+{
+    // data
+    data_len = data_device_size - data_offset;
+    if (data_fd == meta_fd && data_offset < meta_offset)
+    {
+        data_len = meta_offset - data_offset;
+    }
+    if (data_fd == journal_fd && data_offset < journal_offset)
+    {
+        data_len = data_len < journal_offset-data_offset
+            ? data_len : journal_offset-data_offset;
+    }
+    if (cfg_data_size != 0)
+    {
+        if (data_len < cfg_data_size)
+        {
+            throw std::runtime_error("Data area ("+std::to_string(data_len)+
+                " bytes) is smaller than configured size ("+std::to_string(cfg_data_size)+" bytes)");
+        }
+        data_len = cfg_data_size;
+    }
+    // meta
+    uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
+    if (meta_fd == data_fd && meta_offset <= data_offset)
+    {
+        meta_area_size = data_offset - meta_offset;
+    }
+    if (meta_fd == journal_fd && meta_offset <= journal_offset)
+    {
+        meta_area_size = meta_area_size < journal_offset-meta_offset
+            ? meta_area_size : journal_offset-meta_offset;
+    }
+    // journal
+    journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
+    if (journal_fd == data_fd && journal_offset <= data_offset)
+    {
+        journal_len = data_offset - journal_offset;
+    }
+    if (journal_fd == meta_fd && journal_offset <= meta_offset)
+    {
+        journal_len = journal_len < meta_offset-journal_offset
+            ? journal_len : meta_offset-journal_offset;
+    }
+    // required metadata size
+    block_count = data_len / data_block_size;
+    meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
+    if (meta_area_size < meta_len)
+    {
+        throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
+    }
+    // requested journal size
+    if (cfg_journal_size > journal_len)
+    {
+        throw std::runtime_error("Requested journal_size is too large");
+    }
+    else if (cfg_journal_size > 0)
+    {
+        journal_len = cfg_journal_size;
+    }
+    if (journal_len < MIN_JOURNAL_SIZE)
+    {
+        throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
+    }
+}
+
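The meta_len expression above is ceil(block_count / entries-per-block) metadata blocks plus one extra block for the superblock, which blockstore_init skips again further down in this patch. Under the same illustrative 32-byte entry and 1 TiB of data, the formula works out as follows:

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        uint64_t data_len = 1ull << 40;  // 1 TiB of data, assumed for the example
        uint64_t data_block_size = 131072, meta_block_size = 4096, clean_entry_size = 32;
        uint64_t block_count = data_len / data_block_size;       // 8388608 objects
        uint64_t per_block = meta_block_size / clean_entry_size; // 128 entries per 4 KiB block
        // same expression as in calc_lengths(): 1 superblock + ceil(block_count / per_block)
        uint64_t meta_len = (1 + (block_count - 1 + per_block) / per_block) * meta_block_size;
        printf("%llu\n", (unsigned long long)meta_len); // 268439552 = 256 MiB + 4 KiB
        return 0;
    }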
journal device"); + } + check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device"); + if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0) + { + throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno)); + } + } + else + { + journal_fd = meta_fd; + journal_device_sect = meta_device_sect; + journal_device_size = 0; + if (journal_offset >= data_device_size) + { + throw std::runtime_error("journal_offset exceeds device size"); + } + } + if (journal_block_size % journal_device_sect) + { + throw std::runtime_error( + "journal_block_size ("+std::to_string(journal_block_size)+ + ") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")" + ); + } +} + +void blockstore_disk_t::close_all() +{ + if (data_fd >= 0) + close(data_fd); + if (meta_fd >= 0 && meta_fd != data_fd) + close(meta_fd); + if (journal_fd >= 0 && journal_fd != meta_fd) + close(journal_fd); + data_fd = meta_fd = journal_fd = -1; +} diff --git a/src/blockstore_disk.h b/src/blockstore_disk.h new file mode 100644 index 00000000..eb356a41 --- /dev/null +++ b/src/blockstore_disk.h @@ -0,0 +1,42 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) + +#pragma once + +#include + +#include +#include + +struct blockstore_disk_t +{ + std::string data_device, meta_device, journal_device; + uint32_t data_block_size; + uint64_t cfg_journal_size, cfg_data_size; + // Required write alignment and journal/metadata/data areas' location alignment + uint32_t disk_alignment = 4096; + // Journal block size - minimum_io_size of the journal device is the best choice + uint64_t journal_block_size = 4096; + // Metadata block size - minimum_io_size of the metadata device is the best choice + uint64_t meta_block_size = 4096; + // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment + uint64_t bitmap_granularity = 4096; + // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking + bool disable_flock = false; + + int meta_fd = -1, data_fd = -1, journal_fd = -1; + uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len; + uint64_t data_offset, data_device_sect, data_device_size, data_len; + uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len; + + uint32_t block_order; + uint64_t block_count; + uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0; + + void parse_config(std::map & config); + void open_data(); + void open_meta(); + void open_journal(); + void calc_lengths(); + void close_all(); +}; diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 7fcbc745..36aff9ab 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -15,11 +15,11 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs) active_flushers = 0; syncing_flushers = 0; // FIXME: allow to configure flusher_start_threshold and journal_trim_interval - flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable); + flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable); journal_trim_interval = 512; journal_trim_counter = bs->journal.flush_journal ? 1 : 0; trim_wanted = bs->journal.flush_journal ? 1 : 0; - journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size); + journal_superblock = bs->journal.inmemory ? 
bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size); co = new journal_flusher_co[max_flusher_count]; for (int i = 0; i < max_flusher_count; i++) { @@ -486,28 +486,28 @@ resume_1: bs->ringloop->wakeup(); } // Reads completed, submit writes and set bitmap bits - if (bs->clean_entry_bitmap_size) + if (bs->dsk.clean_entry_bitmap_size) { new_clean_bitmap = (bs->inmemory_meta - ? (uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry) - : (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size)); + ? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry) + : (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->dsk.block_order)*(2*bs->dsk.clean_entry_bitmap_size)); if (clean_init_bitmap) { - memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size); - bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->bitmap_granularity); + memset(new_clean_bitmap, 0, bs->dsk.clean_entry_bitmap_size); + bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->dsk.bitmap_granularity); } } for (it = v.begin(); it != v.end(); it++) { if (new_clean_bitmap) { - bitmap_set(new_clean_bitmap, it->offset, it->len, bs->bitmap_granularity); + bitmap_set(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity); } await_sqe(4); data->iov = (struct iovec){ it->buf, (size_t)it->len }; data->callback = simple_callback_w; my_uring_prep_writev( - sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset + sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + clean_loc + it->offset ); wait_count++; } @@ -536,35 +536,35 @@ resume_1: return false; } // zero out old metadata entry - memset((uint8_t*)meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size); + memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size); await_sqe(15); - data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; + data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size }; data->callback = simple_callback_w; my_uring_prep_writev( - sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector + sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_old.sector ); wait_count++; } if (has_delete) { - clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size); + clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size); if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid) { printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx\n", - clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, + clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version, cur.oid.inode, cur.oid.stripe); exit(1); } // zero out new metadata entry - memset((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size); + memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size); } else { - clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size); + clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size); if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid) { printf("Fatal error (metadata corruption 
or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n", - clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version, + clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version); exit(1); } @@ -572,20 +572,20 @@ resume_1: new_entry->version = cur.version; if (!bs->inmemory_meta) { - memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size); + memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_entry_bitmap_size); } // copy latest external bitmap/attributes - if (bs->clean_entry_bitmap_size) + if (bs->dsk.clean_entry_bitmap_size) { - void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap; - memcpy((uint8_t*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size); + void *bmp_ptr = bs->dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap; + memcpy((uint8_t*)(new_entry+1) + bs->dsk.clean_entry_bitmap_size, bmp_ptr, bs->dsk.clean_entry_bitmap_size); } } await_sqe(6); - data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size }; + data->iov = (struct iovec){ meta_new.buf, bs->dsk.meta_block_size }; data->callback = simple_callback_w; my_uring_prep_writev( - sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector + sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_new.sector ); wait_count++; resume_7: @@ -669,9 +669,9 @@ resume_1: .version = JOURNAL_VERSION, }; ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); - data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size }; + data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size }; data->callback = simple_callback_w; - my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); + my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); wait_count++; resume_13: if (wait_count > 0) @@ -682,7 +682,7 @@ resume_1: if (!bs->disable_journal_fsync) { await_sqe(20); - my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); + my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; data->callback = simple_callback_w; resume_21: @@ -774,7 +774,7 @@ bool journal_flusher_co::scan_dirty(int wait_base) data->iov = (struct iovec){ it->buf, (size_t)submit_len }; data->callback = simple_callback_r; my_uring_prep_readv( - sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset + sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + submit_offset ); wait_count++; } @@ -825,8 +825,8 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_ // And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot, // so I'll avoid it as long as I can. 
wr.submitted = false; - wr.sector = ((meta_loc >> bs->block_order) / (bs->meta_block_size / bs->clean_entry_size)) * bs->meta_block_size; - wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size)); + wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size; + wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)); if (bs->inmemory_meta) { wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector; @@ -836,20 +836,20 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_ if (wr.it == flusher->meta_sectors.end()) { // Not in memory yet, read it - wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->meta_block_size); + wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->dsk.meta_block_size); wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){ .offset = wr.sector, - .len = bs->meta_block_size, + .len = bs->dsk.meta_block_size, .state = 0, // 0 = not read yet .buf = wr.buf, .usage_count = 1, }).first; await_sqe(0); - data->iov = (struct iovec){ wr.it->second.buf, bs->meta_block_size }; + data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size }; data->callback = simple_callback_r; wr.submitted = true; my_uring_prep_readv( - sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector + sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + wr.sector ); wait_count++; } @@ -867,11 +867,11 @@ void journal_flusher_co::update_clean_db() { #ifdef BLOCKSTORE_DEBUG printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n", - old_clean_loc >> bs->block_order, + old_clean_loc >> bs->dsk.block_order, cur.oid.inode, cur.oid.stripe, cur.version, - clean_loc >> bs->block_order); + clean_loc >> bs->dsk.block_order); #endif - bs->data_alloc->set(old_clean_loc >> bs->block_order, false); + bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false); } auto & clean_db = bs->clean_db_shard(cur.oid); if (has_delete) @@ -880,10 +880,10 @@ void journal_flusher_co::update_clean_db() clean_db.erase(clean_it); #ifdef BLOCKSTORE_DEBUG printf("Free block %lu from %lx:%lx v%lu (delete)\n", - clean_loc >> bs->block_order, + clean_loc >> bs->dsk.block_order, cur.oid.inode, cur.oid.stripe, cur.version); #endif - bs->data_alloc->set(clean_loc >> bs->block_order, false); + bs->data_alloc->set(clean_loc >> bs->dsk.block_order, false); clean_loc = UINT64_MAX; } else @@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base) await_sqe(0); data->iov = { 0 }; data->callback = simple_callback_w; - my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC); + my_uring_prep_fsync(sqe, fsync_meta ? 
bs->dsk.meta_fd : bs->dsk.data_fd, IORING_FSYNC_DATASYNC); cur_sync->state = 1; wait_count++; resume_2: diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index f10a5e86..dfb0835b 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -11,25 +11,19 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t * ring_consumer.loop = [this]() { loop(); }; ringloop->register_consumer(&ring_consumer); initialized = 0; - data_fd = meta_fd = journal.fd = -1; parse_config(config); - zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, data_block_size); + zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size); try { - open_data(); - open_meta(); - open_journal(); + dsk.open_data(); + dsk.open_meta(); + dsk.open_journal(); calc_lengths(); - data_alloc = new allocator(block_count); + data_alloc = new allocator(dsk.block_count); } catch (std::exception & e) { - if (data_fd >= 0) - close(data_fd); - if (meta_fd >= 0 && meta_fd != data_fd) - close(meta_fd); - if (journal.fd >= 0 && journal.fd != meta_fd) - close(journal.fd); + dsk.close_all(); throw; } flusher = new journal_flusher_t(this); @@ -41,12 +35,7 @@ blockstore_impl_t::~blockstore_impl_t() delete flusher; free(zero_object); ringloop->unregister_consumer(&ring_consumer); - if (data_fd >= 0) - close(data_fd); - if (meta_fd >= 0 && meta_fd != data_fd) - close(meta_fd); - if (journal.fd >= 0 && journal.fd != meta_fd) - close(journal.fd); + dsk.close_all(); if (metadata_buffer) free(metadata_buffer); if (clean_bitmap) @@ -343,9 +332,9 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op) { if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX || ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && ( - op->offset >= data_block_size || - op->len > data_block_size-op->offset || - (op->len % disk_alignment) + op->offset >= dsk.data_block_size || + op->len > dsk.data_block_size-op->offset || + (op->len % dsk.disk_alignment) )) || readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST) { diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index 51f61be4..fab3665a 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -4,6 +4,7 @@ #pragma once #include "blockstore.h" +#include "blockstore_disk.h" #include #include @@ -218,23 +219,10 @@ struct pool_shard_settings_t class blockstore_impl_t { + blockstore_disk_t dsk; + /******* OPTIONS *******/ - std::string data_device, meta_device, journal_device; - uint32_t data_block_size; - uint64_t meta_offset; - uint64_t data_offset; - uint64_t cfg_journal_size, cfg_data_size; - // Required write alignment and journal/metadata/data areas' location alignment - uint32_t disk_alignment = 4096; - // Journal block size - minimum_io_size of the journal device is the best choice - uint64_t journal_block_size = 4096; - // Metadata block size - minimum_io_size of the metadata device is the best choice - uint64_t meta_block_size = 4096; - // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment - uint64_t bitmap_granularity = 4096; bool readonly = false; - // By default, Blockstore locks all opened devices exclusively. 
This option can be used to disable locking - bool disable_flock = false; // It is safe to disable fsync() if drive write cache is writethrough bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false; // Enable if you want every operation to be executed with an "implicit fsync" @@ -269,16 +257,6 @@ class blockstore_impl_t allocator *data_alloc = NULL; uint8_t *zero_object; - uint32_t block_order; - uint64_t block_count; - uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0; - - int meta_fd; - int data_fd; - uint64_t meta_device_size, meta_len; - uint64_t data_device_size, data_len; - uint64_t data_device_sect, meta_device_sect, journal_device_sect; - void *metadata_buffer = NULL; struct journal_t journal; @@ -395,9 +373,9 @@ public: // Print diagnostics to stdout void dump_diagnostics(); - inline uint32_t get_block_size() { return data_block_size; } - inline uint64_t get_block_count() { return block_count; } + inline uint32_t get_block_size() { return dsk.data_block_size; } + inline uint64_t get_block_count() { return dsk.block_count; } inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); } - inline uint32_t get_bitmap_granularity() { return disk_alignment; } - inline uint64_t get_journal_size() { return journal.len; } + inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; } + inline uint64_t get_journal_size() { return dsk.journal_len; } }; diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index f9c88681..2dad747a 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -57,9 +57,9 @@ int blockstore_init_meta::loop() throw std::runtime_error("Failed to allocate metadata read buffer"); // Read superblock GET_SQE(); - data->iov = { metadata_buffer, bs->meta_block_size }; + data->iov = { metadata_buffer, bs->dsk.meta_block_size }; data->callback = [this](ring_data_t *data) { handle_event(data); }; - my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset); + my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset); bs->ringloop->submit(); submitted = 1; resume_1: @@ -68,16 +68,16 @@ resume_1: wait_state = 1; return 1; } - if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t))) + if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t))) { { blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer; hdr->zero = 0; hdr->magic = BLOCKSTORE_META_MAGIC_V1; hdr->version = BLOCKSTORE_META_VERSION_V1; - hdr->meta_block_size = bs->meta_block_size; - hdr->data_block_size = bs->data_block_size; - hdr->bitmap_granularity = bs->bitmap_granularity; + hdr->meta_block_size = bs->dsk.meta_block_size; + hdr->data_block_size = bs->dsk.data_block_size; + hdr->bitmap_granularity = bs->dsk.bitmap_granularity; } if (bs->readonly) { @@ -87,9 +87,9 @@ resume_1: { printf("Initializing metadata area\n"); GET_SQE(); - data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size }; + data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size }; data->callback = [this](ring_data_t *data) { handle_event(data); }; - my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset); + my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset); bs->ringloop->submit(); submitted = 1; resume_3: @@ -115,23 +115,23 @@ resume_1: ); exit(1); } - if (hdr->meta_block_size != bs->meta_block_size || - hdr->data_block_size != bs->data_block_size || - hdr->bitmap_granularity != 
bs->bitmap_granularity) + if (hdr->meta_block_size != bs->dsk.meta_block_size || + hdr->data_block_size != bs->dsk.data_block_size || + hdr->bitmap_granularity != bs->dsk.bitmap_granularity) { printf( "Configuration stored in metadata superblock" " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)" " differs from OSD configuration (%lu/%u/%lu).\n", hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity, - bs->meta_block_size, bs->data_block_size, bs->bitmap_granularity + bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity ); exit(1); } } // Skip superblock - bs->meta_offset += bs->meta_block_size; - bs->meta_len -= bs->meta_block_size; + bs->dsk.meta_offset += bs->dsk.meta_block_size; + bs->dsk.meta_len -= bs->dsk.meta_block_size; prev_done = 0; done_len = 0; done_pos = 0; @@ -145,23 +145,23 @@ resume_1: wait_state = 2; return 1; } - if (metadata_read < bs->meta_len) + if (metadata_read < bs->dsk.meta_len) { GET_SQE(); data->iov = { (uint8_t*)metadata_buffer + (bs->inmemory_meta ? metadata_read : (prev == 1 ? bs->metadata_buf_size : 0)), - bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, + bs->dsk.meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->dsk.meta_len - metadata_read, }; data->callback = [this](ring_data_t *data) { handle_event(data); }; if (!zero_on_init) - my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); + my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read); else { // Fill metadata with zeroes memset(data->iov.iov_base, 0, data->iov.iov_len); - my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); + my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read); } bs->ringloop->submit(); submitted = (prev == 1 ? 2 : 1); @@ -172,11 +172,11 @@ resume_1: void *done_buf = bs->inmemory_meta ? ((uint8_t*)metadata_buffer + done_pos) : ((uint8_t*)metadata_buffer + (prev_done == 2 ? 
bs->metadata_buf_size : 0)); - unsigned count = bs->meta_block_size / bs->clean_entry_size; - for (int sector = 0; sector < done_len; sector += bs->meta_block_size) + unsigned count = bs->dsk.meta_block_size / bs->dsk.clean_entry_size; + for (int sector = 0; sector < done_len; sector += bs->dsk.meta_block_size) { // handle entries - handle_entries((uint8_t*)done_buf + sector, count, bs->block_order); + handle_entries((uint8_t*)done_buf + sector, count, bs->dsk.block_order); done_cnt += count; } prev_done = 0; @@ -188,7 +188,7 @@ resume_1: } } // metadata read finished - printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count); + printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count); if (!bs->inmemory_meta) { free(metadata_buffer); @@ -197,7 +197,7 @@ resume_1: if (zero_on_init && !bs->disable_meta_fsync) { GET_SQE(); - my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC); + my_uring_prep_fsync(sqe, bs->dsk.meta_fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; data->callback = [this](ring_data_t *data) { handle_event(data); }; submitted = 1; @@ -216,10 +216,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo { for (unsigned i = 0; i < count; i++) { - clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size); - if (!bs->inmemory_meta && bs->clean_entry_bitmap_size) + clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->dsk.clean_entry_size); + if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size) { - memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size); + memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2*bs->dsk.clean_entry_bitmap_size); } if (entry->oid.inode > 0) { @@ -240,7 +240,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo } else { - bs->inode_space_stats[entry->oid.inode] += bs->data_block_size; + bs->inode_space_stats[entry->oid.inode] += bs->dsk.data_block_size; } entries_loaded++; #ifdef BLOCKSTORE_DEBUG @@ -328,7 +328,7 @@ int blockstore_init_journal::loop() data = ((ring_data_t*)sqe->user_data); data->iov = { submitted_buf, bs->journal.block_size }; data->callback = simple_callback; - my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); + my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); bs->ringloop->submit(); wait_count = 1; resume_1: @@ -367,7 +367,7 @@ resume_1: GET_SQE(); data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size }; data->callback = simple_callback; - my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); + my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset); wait_count++; bs->ringloop->submit(); resume_6: @@ -379,7 +379,7 @@ resume_1: if (!bs->disable_journal_fsync) { GET_SQE(); - my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); + my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; data->callback = simple_callback; wait_count++; @@ -448,7 +448,7 @@ resume_1: end - journal_pos < JOURNAL_BUFFER_SIZE ? 
end - journal_pos : JOURNAL_BUFFER_SIZE, }; data->callback = [this](ring_data_t *data1) { handle_event(data1); }; - my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos); + my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos); bs->ringloop->submit(); } while (done.size() > 0) @@ -463,7 +463,7 @@ resume_1: GET_SQE(); data->iov = { init_write_buf, bs->journal.block_size }; data->callback = simple_callback; - my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector); + my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector); wait_count++; bs->ringloop->submit(); resume_7: @@ -477,7 +477,7 @@ resume_1: GET_SQE(); data->iov = { 0 }; data->callback = simple_callback; - my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); + my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC); wait_count++; bs->ringloop->submit(); } @@ -544,7 +544,7 @@ resume_1: ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start) : bs->journal.used_start - bs->journal.next_free), bs->journal.used_start, bs->journal.next_free, - bs->data_alloc->get_free_count(), bs->block_count + bs->data_alloc->get_free_count(), bs->dsk.block_count ); bs->journal.crc32_last = crc32_last; return 0; @@ -669,9 +669,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u }; void *bmp = NULL; void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write); - if (bs->clean_entry_bitmap_size <= sizeof(void*)) + if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*)) { - memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size); + memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size); } else { @@ -679,8 +679,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u // allocations for entry bitmaps. This can only be fixed by using // a patched map with dynamic entry size, but not the btree_map, // because it doesn't keep iterators valid all the time. - bmp = malloc_or_die(bs->clean_entry_bitmap_size); - memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size); + bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size); + memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size); } bs->dirty_db.emplace(ov, (dirty_entry){ .state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED), @@ -712,7 +712,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u printf( "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n", je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "", - je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order + je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order ); #endif auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){ @@ -750,9 +750,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u }; void *bmp = NULL; void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write); - if (bs->clean_entry_bitmap_size <= sizeof(void*)) + if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*)) { - memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size); + memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size); } else { @@ -760,8 +760,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u // allocations for entry bitmaps. 
This can only be fixed by using // a patched map with dynamic entry size, but not the btree_map, // because it doesn't keep iterators valid all the time. - bmp = malloc_or_die(bs->clean_entry_bitmap_size); - memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size); + bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size); + memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size); } auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){ .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED), @@ -772,7 +772,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .journal_sector = proc_pos, .bitmap = bmp, }).first; - if (bs->data_alloc->get(je->big_write.location >> bs->block_order)) + if (bs->data_alloc->get(je->big_write.location >> bs->dsk.block_order)) { // This is probably a big_write that's already flushed and freed, but it may // also indicate a bug. So we remember such entries and recheck them afterwards. @@ -785,11 +785,11 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u #ifdef BLOCKSTORE_DEBUG printf( "Allocate block (journal) %lu: %lx:%lx v%lu\n", - je->big_write.location >> bs->block_order, + je->big_write.location >> bs->dsk.block_order, ov.oid.inode, ov.oid.stripe, ov.version ); #endif - bs->data_alloc->set(je->big_write.location >> bs->block_order, true); + bs->data_alloc->set(je->big_write.location >> bs->dsk.block_order, true); } bs->journal.used_sectors[proc_pos]++; #ifdef BLOCKSTORE_DEBUG @@ -913,8 +913,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator if (exists && clean_loc == UINT64_MAX) { auto & sp = bs->inode_space_stats[oid.inode]; - if (sp > bs->data_block_size) - sp -= bs->data_block_size; + if (sp > bs->dsk.data_block_size) + sp -= bs->dsk.data_block_size; else bs->inode_space_stats.erase(oid.inode); } diff --git a/src/blockstore_journal.cpp b/src/blockstore_journal.cpp index 59977201..3c8699d9 100644 --- a/src/blockstore_journal.cpp +++ b/src/blockstore_journal.cpp @@ -175,7 +175,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_ }; data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); }; my_uring_prep_writev( - sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset + sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset ); } journal.sector_info[cur_sector].dirty = false; diff --git a/src/blockstore_journal.h b/src/blockstore_journal.h index 0f401ff1..1023fd8b 100644 --- a/src/blockstore_journal.h +++ b/src/blockstore_journal.h @@ -164,7 +164,6 @@ inline bool operator < (const pending_journaling_t & a, const pending_journaling struct journal_t { int fd; - uint64_t device_size; bool inmemory = false; bool flush_journal = false; void *buffer = NULL; diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 05f0902d..adcfea7c 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -4,23 +4,10 @@ #include #include "blockstore_impl.h" -static uint32_t is_power_of_two(uint64_t value) -{ - uint32_t l = 0; - while (value > 1) - { - if (value & 1) - { - return 64; - } - value = value >> 1; - l++; - } - return l; -} - void blockstore_impl_t::parse_config(blockstore_config_t & config) { + // Common disk options + dsk.parse_config(config); // Parse if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes") { @@ -38,10 +25,6 @@ void 
blockstore_impl_t::parse_config(blockstore_config_t & config) { disable_journal_fsync = true; } - if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes") - { - disable_flock = true; - } if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes") { // Only flush journal and exit @@ -56,24 +39,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) immediate_commit = IMMEDIATE_SMALL; } metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10); - cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10); - data_device = config["data_device"]; - data_offset = strtoull(config["data_offset"].c_str(), NULL, 10); - cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10); - meta_device = config["meta_device"]; - meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10); - data_block_size = strtoull(config["block_size"].c_str(), NULL, 10); inmemory_meta = config["inmemory_metadata"] != "false"; - journal_device = config["journal_device"]; - journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10); journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10); journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" || config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes"; journal.inmemory = config["inmemory_journal"] != "false"; - disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10); - journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10); - meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10); - bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10); max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10); if (!max_flusher_count) max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10); @@ -85,14 +55,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10); throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10); // Validate - if (!data_block_size) - { - data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER); - } - if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE) - { - throw std::runtime_error("Bad block size"); - } if (!max_flusher_count) { max_flusher_count = 256; @@ -105,62 +67,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) { max_write_iodepth = 128; } - if (!disk_alignment) - { - disk_alignment = 4096; - } - else if (disk_alignment % DIRECT_IO_ALIGNMENT) - { - throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT)); - } - if (!journal_block_size) - { - journal_block_size = 4096; - } - else if (journal_block_size % DIRECT_IO_ALIGNMENT) - { - throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT)); - } - if (!meta_block_size) - { - meta_block_size = 4096; - } - else if (meta_block_size % DIRECT_IO_ALIGNMENT) - { - throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT)); - } - if (data_offset % disk_alignment) - { - throw std::runtime_error("data_offset must be a multiple of disk_alignment = 
"+std::to_string(disk_alignment)); - } - if (!bitmap_granularity) - { - bitmap_granularity = DEFAULT_BITMAP_GRANULARITY; - } - else if (bitmap_granularity % disk_alignment) - { - throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment)); - } - if (data_block_size % bitmap_granularity) - { - throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity"); - } - if (journal_device == meta_device || meta_device == "" && journal_device == data_device) - { - journal_device = ""; - } - if (meta_device == data_device) - { - meta_device = ""; - } - if (meta_offset % meta_block_size) - { - throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size)); - } - if (journal.offset % journal_block_size) - { - throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size)); - } if (journal.sector_count < 2) { journal.sector_count = 32; @@ -169,11 +75,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) { metadata_buf_size = 4*1024*1024; } - if (meta_device == "") + if (dsk.meta_device == "") { disable_meta_fsync = disable_data_fsync; } - if (journal_device == "") + if (dsk.journal_device == "") { disable_journal_fsync = disable_meta_fsync; } @@ -202,238 +108,46 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) throttle_threshold_us = 50; } // init some fields - clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8; - clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size; - journal.block_size = journal_block_size; - journal.next_free = journal_block_size; - journal.used_start = journal_block_size; + journal.block_size = dsk.journal_block_size; + journal.next_free = dsk.journal_block_size; + journal.used_start = dsk.journal_block_size; // no free space because sector is initially unmapped - journal.in_sector_pos = journal_block_size; + journal.in_sector_pos = dsk.journal_block_size; } void blockstore_impl_t::calc_lengths() { - // data - data_len = data_device_size - data_offset; - if (data_fd == meta_fd && data_offset < meta_offset) - { - data_len = meta_offset - data_offset; - } - if (data_fd == journal.fd && data_offset < journal.offset) - { - data_len = data_len < journal.offset-data_offset - ? data_len : journal.offset-data_offset; - } - if (cfg_data_size != 0) - { - if (data_len < cfg_data_size) - { - throw std::runtime_error("Data area ("+std::to_string(data_len)+ - " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)"); - } - data_len = cfg_data_size; - } - // meta - uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset; - if (meta_fd == data_fd && meta_offset <= data_offset) - { - meta_area_size = data_offset - meta_offset; - } - if (meta_fd == journal.fd && meta_offset <= journal.offset) - { - meta_area_size = meta_area_size < journal.offset-meta_offset - ? meta_area_size : journal.offset-meta_offset; - } - // journal - journal.len = (journal.fd == data_fd ? data_device_size : (journal.fd == meta_fd ? meta_device_size : journal.device_size)) - journal.offset; - if (journal.fd == data_fd && journal.offset <= data_offset) - { - journal.len = data_offset - journal.offset; - } - if (journal.fd == meta_fd && journal.offset <= meta_offset) - { - journal.len = journal.len < meta_offset-journal.offset - ? 
journal.len : meta_offset-journal.offset;
-    }
-    // required metadata size
-    block_count = data_len / data_block_size;
-    meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
-    if (meta_area_size < meta_len)
-    {
-        throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
-    }
+    dsk.calc_lengths();
+    journal.len = dsk.journal_len;
+    journal.block_size = dsk.journal_block_size;
+    journal.offset = dsk.journal_offset;
     if (inmemory_meta)
     {
-        metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
+        metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
         if (!metadata_buffer)
             throw std::runtime_error("Failed to allocate memory for the metadata");
     }
-    else if (clean_entry_bitmap_size)
+    else if (dsk.clean_entry_bitmap_size)
     {
-        clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size);
+        clean_bitmap = (uint8_t*)malloc(dsk.block_count * 2*dsk.clean_entry_bitmap_size);
         if (!clean_bitmap)
             throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
     }
-    // requested journal size
-    if (cfg_journal_size > journal.len)
-    {
-        throw std::runtime_error("Requested journal_size is too large");
-    }
-    else if (cfg_journal_size > 0)
-    {
-        journal.len = cfg_journal_size;
-    }
-    if (journal.len < MIN_JOURNAL_SIZE)
-    {
-        throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
-    }
     if (journal.inmemory)
     {
         journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
         if (!journal.buffer)
             throw std::runtime_error("Failed to allocate memory for journal");
     }
-}
-
-static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string name)
-{
-    int sect;
-    struct stat st;
-    if (fstat(fd, &st) < 0)
-    {
-        throw std::runtime_error("Failed to stat "+name);
-    }
-    if (S_ISREG(st.st_mode))
-    {
-        *size = st.st_size;
-        if (sectsize)
-        {
-            *sectsize = st.st_blksize;
-        }
-    }
-    else if (S_ISBLK(st.st_mode))
-    {
-        if (ioctl(fd, BLKGETSIZE64, size) < 0 ||
-            ioctl(fd, BLKSSZGET, &sect) < 0)
-        {
-            throw std::runtime_error("Failed to get "+name+" size or block size: "+strerror(errno));
-        }
-        if (sectsize)
-        {
-            *sectsize = sect;
-        }
-    }
-    else
-    {
-        throw std::runtime_error(name+" is neither a file nor a block device");
-    }
-}
-
-void blockstore_impl_t::open_data()
-{
-    data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
-    if (data_fd == -1)
-    {
-        throw std::runtime_error("Failed to open data device");
-    }
-    check_size(data_fd, &data_device_size, &data_device_sect, "data device");
-    if (disk_alignment % data_device_sect)
-    {
-        throw std::runtime_error(
-            "disk_alignment ("+std::to_string(disk_alignment)+
-            ") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
-        );
-    }
-    if (data_offset >= data_device_size)
-    {
-        throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
-    }
-    if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
-    {
-        throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
-    }
-}
-
-void blockstore_impl_t::open_meta()
-{
-    if (meta_device != "")
-    {
-        meta_offset = 0;
-        meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
-        if (meta_fd == -1)
-        {
-            throw std::runtime_error("Failed to open metadata device");
-        }
-        check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
-        if (meta_offset >= meta_device_size)
-        {
-            throw std::runtime_error("meta_offset exceeds device size = 
"+std::to_string(meta_device_size)); - } - if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0) - { - throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno)); - } - } - else - { - meta_fd = data_fd; - meta_device_sect = data_device_sect; - meta_device_size = 0; - if (meta_offset >= data_device_size) - { - throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_device_size)); - } - } - if (meta_block_size % meta_device_sect) - { - throw std::runtime_error( - "meta_block_size ("+std::to_string(meta_block_size)+ - ") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")" - ); - } -} - -void blockstore_impl_t::open_journal() -{ - if (journal_device != "") - { - journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR); - if (journal.fd == -1) - { - throw std::runtime_error("Failed to open journal device"); - } - check_size(journal.fd, &journal.device_size, &journal_device_sect, "journal device"); - if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0) - { - throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno)); - } - } - else - { - journal.fd = meta_fd; - journal_device_sect = meta_device_sect; - journal.device_size = 0; - if (journal.offset >= data_device_size) - { - throw std::runtime_error("journal_offset exceeds device size"); - } + journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size); + if (!journal.sector_buf) + throw std::bad_alloc(); } journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t)); if (!journal.sector_info) { throw std::bad_alloc(); } - if (!journal.inmemory) - { - journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size); - if (!journal.sector_buf) - throw std::bad_alloc(); - } - if (journal_block_size % journal_device_sect) - { - throw std::runtime_error( - "journal_block_size ("+std::to_string(journal_block_size)+ - ") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")" - ); - } } diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index eb53cf5a..b50f4b54 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -32,9 +32,9 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_ PRIV(op)->pending_ops++; my_uring_prep_readv( sqe, - IS_JOURNAL(item_state) ? journal.fd : data_fd, + IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd, &data->iov, 1, - (IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset + (IS_JOURNAL(item_state) ? 
diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp
index eb53cf5a..b50f4b54 100644
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -32,9 +32,9 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
     PRIV(op)->pending_ops++;
     my_uring_prep_readv(
         sqe,
-        IS_JOURNAL(item_state) ? journal.fd : data_fd,
+        IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd,
         &data->iov, 1,
-        (IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset
+        (IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
     );
     data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
     return 1;
@@ -97,15 +97,15 @@ endwhile:
 uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
 {
     uint8_t *clean_entry_bitmap;
-    uint64_t meta_loc = block_loc >> block_order;
+    uint64_t meta_loc = block_loc >> dsk.block_order;
     if (inmemory_meta)
     {
-        uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
-        uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
-        clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
+        uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
+        uint64_t pos = (meta_loc % (dsk.meta_block_size / dsk.clean_entry_size));
+        clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
     }
     else
-        clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
+        clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
     return clean_entry_bitmap;
 }
@@ -152,8 +152,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                 result_version = dirty_it->first.version;
                 if (read_op->bitmap)
                 {
-                    void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
-                    memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
+                    void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
+                    memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
                 }
             }
             if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
@@ -178,15 +178,15 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
             result_version = clean_it->second.version;
             if (read_op->bitmap)
             {
-                void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-                memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
+                void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
+                memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
             }
         }
         if (fulfilled < read_op->len)
         {
-            if (!clean_entry_bitmap_size)
+            if (!dsk.clean_entry_bitmap_size)
             {
-                if (!fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
+                if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
                 {
                     // need to wait. undo added requests, don't dequeue op
                     PRIV(read_op)->read_vec.clear();
@@ -196,7 +196,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
             else
             {
                 uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
-                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = data_block_size/bitmap_granularity;
+                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
                 while (bmp_start < bmp_size)
                 {
                     while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
@@ -206,8 +206,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                     if (bmp_end > bmp_start)
                     {
                         // fill with zeroes
-                        assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                            bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+                        assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
+                            bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
                     }
                     bmp_start = bmp_end;
                     while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@@ -216,9 +216,9 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                     }
                     if (bmp_end > bmp_start)
                     {
-                        if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                            bmp_end * bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
-                            clean_it->second.location + bmp_start * bitmap_granularity))
+                        if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
+                            bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
+                            clean_it->second.location + bmp_start * dsk.bitmap_granularity))
                         {
                             // need to wait. undo added requests, don't dequeue op
                             PRIV(read_op)->read_vec.clear();
@@ -233,7 +233,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
     else if (fulfilled < read_op->len)
     {
         // fill remaining parts with zeroes
-        assert(fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+        assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
     }
     assert(fulfilled == read_op->len);
     read_op->version = result_version;
@@ -288,8 +288,8 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
         *result_version = dirty_it->first.version;
         if (bitmap)
         {
-            void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
-            memcpy(bitmap, bmp_ptr, clean_entry_bitmap_size);
+            void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
+            memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
         }
         return 0;
     }
@@ -306,14 +306,14 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
         *result_version = clean_it->second.version;
         if (bitmap)
        {
-            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-            memcpy(bitmap, bmp_ptr, clean_entry_bitmap_size);
+            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
+            memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
         }
         return 0;
     }
     if (result_version)
         *result_version = 0;
     if (bitmap)
-        memset(bitmap, 0, clean_entry_bitmap_size);
+        memset(bitmap, 0, dsk.clean_entry_bitmap_size);
     return -ENOENT;
 }
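
All blockstore_read.cpp changes above are mechanical renames to the dsk.* fields, but the loop they touch deserves a note: a clean object is served by scanning its allocation bitmap at dsk.bitmap_granularity and splitting the block into zero runs (bit clear, fulfilled from memory) and data runs (bit set, read from the data device). A self-contained sketch of that scan, with fulfill_read() replaced by callbacks and the bounds check hoisted before the bit test; names are illustrative:

```cpp
// Standalone version of the run-splitting scan in dequeue_read():
// clear bits become zero-fill runs, set bits become data-read runs.
#include <cstdint>
#include <functional>

static void scan_bitmap(const uint8_t *bmp, uint64_t block_size, uint64_t granularity,
    std::function<void(uint64_t start, uint64_t end)> read_run,
    std::function<void(uint64_t start, uint64_t end)> zero_run)
{
    uint64_t bmp_size = block_size / granularity;
    uint64_t bmp_start = 0, bmp_end = 0;
    while (bmp_start < bmp_size)
    {
        // skip a run of clear bits -> zero-fill that byte range
        while (bmp_end < bmp_size && !(bmp[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
            bmp_end++;
        if (bmp_end > bmp_start)
            zero_run(bmp_start * granularity, bmp_end * granularity);
        bmp_start = bmp_end;
        // collect a run of set bits -> read that byte range from data
        while (bmp_end < bmp_size && (bmp[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
            bmp_end++;
        if (bmp_end > bmp_start)
            read_run(bmp_start * granularity, bmp_end * granularity);
        bmp_start = bmp_end;
    }
}
```

Keeping bmp_end shared between the two inner loops is what makes the runs alternate without rescanning any bit.
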
diff --git a/src/blockstore_rollback.cpp b/src/blockstore_rollback.cpp
index 3349c761..f07655da 100644
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@@ -112,7 +112,7 @@ resume_2:
         if (!disable_journal_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
             data->iov = { 0 };
             data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
             PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -217,10 +217,10 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
             dirty_it->second.location != UINT64_MAX)
         {
 #ifdef BLOCKSTORE_DEBUG
-            printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
+            printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> dsk.block_order,
                 dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
-            data_alloc->set(dirty_it->second.location >> block_order, false);
+            data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
         }
         int used = --journal.used_sectors[dirty_it->second.journal_sector];
 #ifdef BLOCKSTORE_DEBUG
@@ -233,7 +233,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
         {
             journal.used_sectors.erase(dirty_it->second.journal_sector);
         }
-        if (clean_entry_bitmap_size > sizeof(void*))
+        if (dsk.clean_entry_bitmap_size > sizeof(void*))
         {
             free(dirty_it->second.bitmap);
             dirty_it->second.bitmap = NULL;
diff --git a/src/blockstore_stable.cpp b/src/blockstore_stable.cpp
index 3a91b8d2..34be29ec 100644
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@@ -137,7 +137,7 @@ resume_2:
         if (!disable_journal_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
             data->iov = { 0 };
             data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
             PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -195,14 +195,14 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
             }
             if (!exists)
             {
-                inode_space_stats[dirty_it->first.oid.inode] += data_block_size;
+                inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
             }
         }
         else if (IS_DELETE(dirty_it->second.state))
         {
             auto & sp = inode_space_stats[dirty_it->first.oid.inode];
-            if (sp > data_block_size)
-                sp -= data_block_size;
+            if (sp > dsk.data_block_size)
+                sp -= dsk.data_block_size;
             else
                 inode_space_stats.erase(dirty_it->first.oid.inode);
         }
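
In mark_stable() above, only the block-size constant changes, but the accounting rule it implements is worth stating: stabilizing a big write charges one full data block to the inode unless an older version already existed, and stabilizing a delete refunds one block, dropping the map entry when usage reaches zero. A simplified sketch of that bookkeeping (the real map lives in blockstore_impl_t; types here are illustrative):

```cpp
// Per-inode space accounting as adjusted by mark_stable().
#include <cstdint>
#include <map>

using inode_t = uint64_t;

static void account_stable(std::map<inode_t, uint64_t> & space_stats,
    inode_t inode, uint64_t data_block_size, bool is_delete, bool existed)
{
    if (!is_delete)
    {
        // a newly stabilized big write occupies one whole data block
        if (!existed)
            space_stats[inode] += data_block_size;
    }
    else
    {
        // a stabilized delete frees one block; erase the entry at zero
        auto & sp = space_stats[inode];
        if (sp > data_block_size)
            sp -= data_block_size;
        else
            space_stats.erase(inode);
    }
}
```
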
diff --git a/src/blockstore_sync.cpp b/src/blockstore_sync.cpp
index ce238780..ddacff16 100644
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@@ -60,7 +60,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
         if (!disable_data_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
             data->iov = { 0 };
             data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
             PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -79,7 +79,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
         // Check space in the journal and journal memory buffers
         blockstore_journal_check_t space_check(this);
         if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
         {
             return 0;
         }
@@ -90,7 +90,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
         int s = 0;
         while (it != PRIV(op)->sync_big_writes.end())
         {
-            if (!journal.entry_fits(sizeof(journal_entry_big_write) + clean_entry_bitmap_size) &&
+            if (!journal.entry_fits(sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size) &&
                 journal.sector_info[journal.cur_sector].dirty)
             {
                 prepare_journal_sector_write(journal.cur_sector, op);
@@ -99,7 +99,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
             auto & dirty_entry = dirty_db.at(*it);
             journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
                 journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-                sizeof(journal_entry_big_write) + clean_entry_bitmap_size
+                sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
             );
             dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
             journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -115,8 +115,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
             je->offset = dirty_entry.offset;
             je->len = dirty_entry.len;
             je->location = dirty_entry.location;
-            memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*)
-                ? dirty_entry.bitmap : &dirty_entry.bitmap), clean_entry_bitmap_size);
+            memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*)
+                ? dirty_entry.bitmap : &dirty_entry.bitmap), dsk.clean_entry_bitmap_size);
             je->crc32 = je_crc32((journal_entry*)je);
             journal.crc32_last = je->crc32;
             it++;
@@ -132,7 +132,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
         if (!disable_journal_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
             data->iov = { 0 };
             data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
             PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
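
continue_sync() above reserves journal space for one journal_entry_big_write plus the object bitmap per unsynced big write, and flushes the current journal sector whenever the next entry would not fit in it. As far as these hunks show, the fit test is plain sector arithmetic; a hedged sketch (journal_t's actual entry_fits() may consult more state than this):

```cpp
// Illustrative stand-in for the "does this entry fit in the current
// journal sector" test. Field names mirror the diff; the struct is not
// the real journal_t.
#include <cstdint>

struct journal_cursor_state
{
    uint64_t block_size = 4096;   // dsk.journal_block_size
    uint64_t in_sector_pos = 0;   // bytes already used in the current sector
};

// An entry fits if it doesn't cross the sector's end; otherwise the
// dirty sector is submitted first and a fresh one is started.
static bool entry_fits(const journal_cursor_state & j, uint64_t entry_size)
{
    return j.block_size - j.in_sector_pos >= entry_size;
}
```
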
bmp : &bmp); - uint32_t bit = op->offset/bitmap_granularity; - uint32_t bits_left = op->len/bitmap_granularity; + uint8_t *bmp_ptr = (uint8_t*)(dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp); + uint32_t bit = op->offset/dsk.bitmap_granularity; + uint32_t bits_left = op->len/dsk.bitmap_granularity; while (!(bit % 8) && bits_left > 8) { // Copy bytes @@ -175,7 +175,7 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_ { while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid) { - if (clean_entry_bitmap_size > sizeof(void*)) + if (dsk.clean_entry_bitmap_size > sizeof(void*)) free(dirty_it->second.bitmap); dirty_db.erase(dirty_it++); } @@ -251,7 +251,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) { blockstore_journal_check_t space_check(this); if (!space_check.check_available(op, unsynced_big_write_count + 1, - sizeof(journal_entry_big_write) + clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION)) + sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION)) { return 0; } @@ -271,7 +271,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) } BS_SUBMIT_GET_SQE(sqe, data); write_iodepth++; - dirty_it->second.location = loc << block_order; + dirty_it->second.location = loc << dsk.block_order; dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED; #ifdef BLOCKSTORE_DEBUG printf( @@ -280,9 +280,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) ); #endif data_alloc->set(loc, true); - uint64_t stripe_offset = (op->offset % bitmap_granularity); - uint64_t stripe_end = (op->offset + op->len) % bitmap_granularity; - // Zero fill up to bitmap_granularity + uint64_t stripe_offset = (op->offset % dsk.bitmap_granularity); + uint64_t stripe_end = (op->offset + op->len) % dsk.bitmap_granularity; + // Zero fill up to dsk.bitmap_granularity int vcnt = 0; if (stripe_offset) { @@ -291,13 +291,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len }; if (stripe_end) { - stripe_end = bitmap_granularity - stripe_end; + stripe_end = dsk.bitmap_granularity - stripe_end; PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end }; } data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; my_uring_prep_writev( - sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset + sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset ); PRIV(op)->pending_ops = 1; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; @@ -319,9 +319,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) blockstore_journal_check_t space_check(this); if (unsynced_big_write_count && !space_check.check_available(op, unsynced_big_write_count, - sizeof(journal_entry_big_write) + clean_entry_bitmap_size, 0) + sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0) || !space_check.check_available(op, 1, - sizeof(journal_entry_small_write) + clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION)) + sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION)) { return 0; } @@ -329,7 +329,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) BS_SUBMIT_CHECK_SQES( // 
@@ -319,9 +319,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         blockstore_journal_check_t space_check(this);
         if (unsynced_big_write_count &&
             !space_check.check_available(op, unsynced_big_write_count,
-                sizeof(journal_entry_big_write) + clean_entry_bitmap_size, 0)
+                sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
             || !space_check.check_available(op, 1,
-                sizeof(journal_entry_small_write) + clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
+                sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
         {
             return 0;
         }
@@ -329,7 +329,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         BS_SUBMIT_CHECK_SQES(
             // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
             (immediate_commit != IMMEDIATE_NONE ||
-                !journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size) ? 1 : 0) +
+                !journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size) ? 1 : 0) +
             (op->len > 0 ? 1 : 0)
         );
         write_iodepth++;
@@ -337,7 +337,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
         if (immediate_commit == IMMEDIATE_NONE)
         {
-            if (!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size))
+            if (!journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size))
             {
                 prepare_journal_sector_write(journal.cur_sector, op);
             }
@@ -349,7 +349,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         // Then pre-fill journal entry
         journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
             journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
-            sizeof(journal_entry_small_write) + clean_entry_bitmap_size
+            sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size
         );
         dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
         journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -361,14 +361,14 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         );
 #endif
         // Figure out where data will be
-        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
+        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
         je->oid = op->oid;
         je->version = op->version;
         je->offset = op->offset;
         je->len = op->len;
         je->data_offset = journal.next_free;
         je->crc32_data = crc32c(0, op->buf, op->len);
-        memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
+        memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
         je->crc32 = je_crc32((journal_entry*)je);
         journal.crc32_last = je->crc32;
         if (immediate_commit != IMMEDIATE_NONE)
@@ -387,7 +387,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
             data2->iov = (struct iovec){ op->buf, op->len };
             data2->callback = cb;
             my_uring_prep_writev(
-                sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
+                sqe2, dsk.journal_fd, &data2->iov, 1, journal.offset + journal.next_free
             );
             PRIV(op)->pending_ops++;
         }
@@ -400,7 +400,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         journal.next_free += op->len;
         if (journal.next_free >= journal.len)
         {
-            journal.next_free = journal_block_size;
+            journal.next_free = dsk.journal_block_size;
         }
         if (!PRIV(op)->pending_ops)
         {
@@ -440,7 +440,7 @@ resume_2:
     assert(dirty_it != dirty_db.end());
     journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
         journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-        sizeof(journal_entry_big_write) + clean_entry_bitmap_size
+        sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
     );
     dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
     journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -456,7 +456,7 @@ resume_2:
     je->oid = op->oid;
     je->version = op->version;
     je->offset = op->offset;
     je->len = op->len;
     je->location = dirty_it->second.location;
-    memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
+    memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
     je->crc32 = je_crc32((journal_entry*)je);
     journal.crc32_last = je->crc32;
     prepare_journal_sector_write(journal.cur_sector, op);
@@ -634,7 +634,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
     // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
     BS_SUBMIT_CHECK_SQES(
         (immediate_commit != IMMEDIATE_NONE ||
-            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+            (dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
             journal.sector_info[journal.cur_sector].dirty) ? 1 : 0
     );
     if (write_iodepth >= max_write_iodepth)
     {
@@ -645,7 +645,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
     // Prepare journal sector write
     if (immediate_commit == IMMEDIATE_NONE)
     {
-        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        if ((dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
             journal.sector_info[journal.cur_sector].dirty)
         {
             prepare_journal_sector_write(journal.cur_sector, op);
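
A recurring detail in the small-write path above is journal data placement: the payload is appended at journal.next_free, and whenever it would overrun journal.len, placement wraps to the first block after the journal header (one dsk.journal_block_size). A sketch of that wrap logic with an illustrative cursor struct (not the real journal_t):

```cpp
// Journal data-area placement with wrap-around, as in the hunks above.
#include <cstdint>

struct journal_cursor
{
    uint64_t len = 0;             // total journal length
    uint64_t block_size = 4096;   // dsk.journal_block_size
    uint64_t next_free = 4096;    // next data position (starts after the header)
};

static uint64_t place_small_write(journal_cursor & j, uint64_t data_len)
{
    // wrap before placing if the payload wouldn't fit at the tail
    if (j.next_free + data_len > j.len)
        j.next_free = j.block_size;
    uint64_t at = j.next_free;
    j.next_free += data_len;
    // wrap after placing so the next payload never starts past the end
    if (j.next_free >= j.len)
        j.next_free = j.block_size;
    return at;
}
```
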