Compare commits

..

No commits in common. "data-resize-tool" and "master" have entirely different histories.

18 changed files with 459 additions and 1009 deletions

View File

@ -1,3 +1,3 @@
usr/bin/vitastor-osd usr/bin/vitastor-osd
usr/bin/vitastor-disk
usr/bin/vitastor-dump-journal usr/bin/vitastor-dump-journal
usr/bin/vitastor-dump-meta

View File

@ -112,8 +112,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd %files -n vitastor-osd
%_bindir/vitastor-osd %_bindir/vitastor-osd
%_bindir/vitastor-disk
%_bindir/vitastor-dump-journal %_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon %files -n vitastor-mon

View File

@ -109,8 +109,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd %files -n vitastor-osd
%_bindir/vitastor-osd %_bindir/vitastor-osd
%_bindir/vitastor-disk
%_bindir/vitastor-dump-journal %_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon %files -n vitastor-mon

View File

@ -193,13 +193,14 @@ target_link_libraries(vitastor-cli
) )
configure_file(vitastor.pc.in vitastor.pc @ONLY) configure_file(vitastor.pc.in vitastor.pc @ONLY)
# vitastor-disk # vitastor-dump-journal
add_executable(vitastor-disk add_executable(vitastor-dump-journal
disk_tool.cpp crc32c.c rw_blocking.cpp allocator.cpp ringloop.cpp dump_journal.cpp crc32c.c
) )
target_link_libraries(vitastor-disk
tcmalloc_minimal # vitastor-dump-meta
${LIBURING_LIBRARIES} add_executable(vitastor-dump-meta
dump_meta.cpp rw_blocking.cpp
) )
if (${WITH_QEMU}) if (${WITH_QEMU})
@ -279,8 +280,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
### Install ### Install
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(TARGETS vitastor-osd vitastor-dump-journal vitastor-dump-meta vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm) install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita) install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install( install(

View File

@ -29,9 +29,9 @@
#endif #endif
// Default block size is 128 KB, current allowed range is 4K - 128M // Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_DATA_BLOCK_ORDER 17 #define DEFAULT_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024 #define MIN_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024 #define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BITMAP_GRANULARITY 4096 #define DEFAULT_BITMAP_GRANULARITY 4096
#define BS_OP_MIN 1 #define BS_OP_MIN 1
@ -193,6 +193,7 @@ public:
// Print diagnostics to stdout // Print diagnostics to stdout
void dump_diagnostics(); void dump_diagnostics();
// FIXME rename to object_size
uint32_t get_block_size(); uint32_t get_block_size();
uint64_t get_block_count(); uint64_t get_block_count();
uint64_t get_free_block_count(); uint64_t get_free_block_count();

View File

@ -13,7 +13,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
initialized = 0; initialized = 0;
data_fd = meta_fd = journal.fd = -1; data_fd = meta_fd = journal.fd = -1;
parse_config(config); parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, data_block_size); zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
try try
{ {
open_data(); open_data();
@ -343,8 +343,8 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{ {
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX || if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && ( ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
op->offset >= data_block_size || op->offset >= block_size ||
op->len > data_block_size-op->offset || op->len > block_size-op->offset ||
(op->len % disk_alignment) (op->len % disk_alignment)
)) || )) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST) readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
@ -477,7 +477,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
uint64_t min_inode = op->oid.inode; uint64_t min_inode = op->oid.inode;
uint64_t max_inode = op->version; uint64_t max_inode = op->version;
// Check PG // Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count)) if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
{ {
op->retval = -EINVAL; op->retval = -EINVAL;
FINISH_OP(op); FINISH_OP(op);

View File

@ -219,7 +219,7 @@ class blockstore_impl_t
{ {
/******* OPTIONS *******/ /******* OPTIONS *******/
std::string data_device, meta_device, journal_device; std::string data_device, meta_device, journal_device;
uint32_t data_block_size; uint32_t block_size;
uint64_t meta_offset; uint64_t meta_offset;
uint64_t data_offset; uint64_t data_offset;
uint64_t cfg_journal_size, cfg_data_size; uint64_t cfg_journal_size, cfg_data_size;
@ -274,8 +274,8 @@ class blockstore_impl_t
int meta_fd; int meta_fd;
int data_fd; int data_fd;
uint64_t meta_device_size, meta_len; uint64_t meta_size, meta_area, meta_len;
uint64_t data_device_size, data_len; uint64_t data_size, data_len;
uint64_t data_device_sect, meta_device_sect, journal_device_sect; uint64_t data_device_sect, meta_device_sect, journal_device_sect;
void *metadata_buffer = NULL; void *metadata_buffer = NULL;
@ -394,7 +394,7 @@ public:
// Print diagnostics to stdout // Print diagnostics to stdout
void dump_diagnostics(); void dump_diagnostics();
inline uint32_t get_block_size() { return data_block_size; } inline uint32_t get_block_size() { return block_size; }
inline uint64_t get_block_count() { return block_count; } inline uint64_t get_block_count() { return block_count; }
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); } inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
inline uint32_t get_bitmap_granularity() { return disk_alignment; } inline uint32_t get_bitmap_granularity() { return disk_alignment; }

View File

@ -76,7 +76,7 @@ resume_1:
hdr->magic = BLOCKSTORE_META_MAGIC_V1; hdr->magic = BLOCKSTORE_META_MAGIC_V1;
hdr->version = BLOCKSTORE_META_VERSION_V1; hdr->version = BLOCKSTORE_META_VERSION_V1;
hdr->meta_block_size = bs->meta_block_size; hdr->meta_block_size = bs->meta_block_size;
hdr->data_block_size = bs->data_block_size; hdr->data_block_size = bs->block_size;
hdr->bitmap_granularity = bs->bitmap_granularity; hdr->bitmap_granularity = bs->bitmap_granularity;
} }
if (bs->readonly) if (bs->readonly)
@ -116,7 +116,7 @@ resume_1:
exit(1); exit(1);
} }
if (hdr->meta_block_size != bs->meta_block_size || if (hdr->meta_block_size != bs->meta_block_size ||
hdr->data_block_size != bs->data_block_size || hdr->data_block_size != bs->block_size ||
hdr->bitmap_granularity != bs->bitmap_granularity) hdr->bitmap_granularity != bs->bitmap_granularity)
{ {
printf( printf(
@ -124,7 +124,7 @@ resume_1:
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)" " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
" differs from OSD configuration (%lu/%u/%lu).\n", " differs from OSD configuration (%lu/%u/%lu).\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity, hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
bs->meta_block_size, bs->data_block_size, bs->bitmap_granularity bs->meta_block_size, bs->block_size, bs->bitmap_granularity
); );
exit(1); exit(1);
} }
@ -240,7 +240,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
} }
else else
{ {
bs->inode_space_stats[entry->oid.inode] += bs->data_block_size; bs->inode_space_stats[entry->oid.inode] += bs->block_size;
} }
entries_loaded++; entries_loaded++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
@ -913,8 +913,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
if (exists && clean_loc == UINT64_MAX) if (exists && clean_loc == UINT64_MAX)
{ {
auto & sp = bs->inode_space_stats[oid.inode]; auto & sp = bs->inode_space_stats[oid.inode];
if (sp > bs->data_block_size) if (sp > bs->block_size)
sp -= bs->data_block_size; sp -= bs->block_size;
else else
bs->inode_space_stats.erase(oid.inode); bs->inode_space_stats.erase(oid.inode);
} }

View File

@ -62,7 +62,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10); cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"]; meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10); meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
data_block_size = strtoull(config["block_size"].c_str(), NULL, 10); block_size = strtoull(config["block_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false"; inmemory_meta = config["inmemory_metadata"] != "false";
journal_device = config["journal_device"]; journal_device = config["journal_device"];
journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10); journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
@ -85,11 +85,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10); throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10); throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate // Validate
if (!data_block_size) if (!block_size)
{ {
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER); block_size = (1 << DEFAULT_ORDER);
} }
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE) if ((block_order = is_power_of_two(block_size)) >= 64 || block_size < MIN_BLOCK_SIZE || block_size >= MAX_BLOCK_SIZE)
{ {
throw std::runtime_error("Bad block size"); throw std::runtime_error("Bad block size");
} }
@ -141,7 +141,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{ {
throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment)); throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
} }
if (data_block_size % bitmap_granularity) if (block_size % bitmap_granularity)
{ {
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity"); throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
} }
@ -202,7 +202,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
throttle_threshold_us = 50; throttle_threshold_us = 50;
} }
// init some fields // init some fields
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8; clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size; clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
journal.block_size = journal_block_size; journal.block_size = journal_block_size;
journal.next_free = journal_block_size; journal.next_free = journal_block_size;
@ -214,7 +214,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
void blockstore_impl_t::calc_lengths() void blockstore_impl_t::calc_lengths()
{ {
// data // data
data_len = data_device_size - data_offset; data_len = data_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset) if (data_fd == meta_fd && data_offset < meta_offset)
{ {
data_len = meta_offset - data_offset; data_len = meta_offset - data_offset;
@ -234,18 +234,18 @@ void blockstore_impl_t::calc_lengths()
data_len = cfg_data_size; data_len = cfg_data_size;
} }
// meta // meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset; meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset) if (meta_fd == data_fd && meta_offset <= data_offset)
{ {
meta_area_size = data_offset - meta_offset; meta_area = data_offset - meta_offset;
} }
if (meta_fd == journal.fd && meta_offset <= journal.offset) if (meta_fd == journal.fd && meta_offset <= journal.offset)
{ {
meta_area_size = meta_area_size < journal.offset-meta_offset meta_area = meta_area < journal.offset-meta_offset
? meta_area_size : journal.offset-meta_offset; ? meta_area : journal.offset-meta_offset;
} }
// journal // journal
journal.len = (journal.fd == data_fd ? data_device_size : (journal.fd == meta_fd ? meta_device_size : journal.device_size)) - journal.offset; journal.len = (journal.fd == data_fd ? data_size : (journal.fd == meta_fd ? meta_size : journal.device_size)) - journal.offset;
if (journal.fd == data_fd && journal.offset <= data_offset) if (journal.fd == data_fd && journal.offset <= data_offset)
{ {
journal.len = data_offset - journal.offset; journal.len = data_offset - journal.offset;
@ -256,9 +256,9 @@ void blockstore_impl_t::calc_lengths()
? journal.len : meta_offset-journal.offset; ? journal.len : meta_offset-journal.offset;
} }
// required metadata size // required metadata size
block_count = data_len / data_block_size; block_count = data_len / block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size; meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_area_size < meta_len) if (meta_area < meta_len)
{ {
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes"); throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
} }
@ -316,7 +316,7 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
if (ioctl(fd, BLKGETSIZE64, size) < 0 || if (ioctl(fd, BLKGETSIZE64, size) < 0 ||
ioctl(fd, BLKSSZGET, &sect) < 0) ioctl(fd, BLKSSZGET, &sect) < 0)
{ {
throw std::runtime_error("Failed to get "+name+" size or block size: "+strerror(errno)); throw std::runtime_error("failed to get "+name+" size or block size: "+strerror(errno));
} }
if (sectsize) if (sectsize)
{ {
@ -336,7 +336,7 @@ void blockstore_impl_t::open_data()
{ {
throw std::runtime_error("Failed to open data device"); throw std::runtime_error("Failed to open data device");
} }
check_size(data_fd, &data_device_size, &data_device_sect, "data device"); check_size(data_fd, &data_size, &data_device_sect, "data device");
if (disk_alignment % data_device_sect) if (disk_alignment % data_device_sect)
{ {
throw std::runtime_error( throw std::runtime_error(
@ -344,9 +344,9 @@ void blockstore_impl_t::open_data()
") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")" ") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
); );
} }
if (data_offset >= data_device_size) if (data_offset >= data_size)
{ {
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size)); throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
} }
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0) if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{ {
@ -364,10 +364,10 @@ void blockstore_impl_t::open_meta()
{ {
throw std::runtime_error("Failed to open metadata device"); throw std::runtime_error("Failed to open metadata device");
} }
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device"); check_size(meta_fd, &meta_size, &meta_device_sect, "metadata device");
if (meta_offset >= meta_device_size) if (meta_offset >= meta_size)
{ {
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size)); throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
} }
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0) if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{ {
@ -378,10 +378,10 @@ void blockstore_impl_t::open_meta()
{ {
meta_fd = data_fd; meta_fd = data_fd;
meta_device_sect = data_device_sect; meta_device_sect = data_device_sect;
meta_device_size = 0; meta_size = 0;
if (meta_offset >= data_device_size) if (meta_offset >= data_size)
{ {
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_device_size)); throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
} }
} }
if (meta_block_size % meta_device_sect) if (meta_block_size % meta_device_sect)
@ -413,7 +413,7 @@ void blockstore_impl_t::open_journal()
journal.fd = meta_fd; journal.fd = meta_fd;
journal_device_sect = meta_device_sect; journal_device_sect = meta_device_sect;
journal.device_size = 0; journal.device_size = 0;
if (journal.offset >= data_device_size) if (journal.offset >= data_size)
{ {
throw std::runtime_error("journal_offset exceeds device size"); throw std::runtime_error("journal_offset exceeds device size");
} }

View File

@ -186,7 +186,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{ {
if (!clean_entry_bitmap_size) if (!clean_entry_bitmap_size)
{ {
if (!fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location)) if (!fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
{ {
// need to wait. undo added requests, don't dequeue op // need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear(); PRIV(read_op)->read_vec.clear();
@ -196,7 +196,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else else
{ {
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0); uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = data_block_size/bitmap_granularity; uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size) while (bmp_start < bmp_size)
{ {
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size) while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
@ -233,7 +233,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else if (fulfilled < read_op->len) else if (fulfilled < read_op->len)
{ {
// fill remaining parts with zeroes // fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0)); assert(fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
} }
assert(fulfilled == read_op->len); assert(fulfilled == read_op->len);
read_op->version = result_version; read_op->version = result_version;

View File

@ -195,14 +195,14 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
} }
if (!exists) if (!exists)
{ {
inode_space_stats[dirty_it->first.oid.inode] += data_block_size; inode_space_stats[dirty_it->first.oid.inode] += block_size;
} }
} }
else if (IS_DELETE(dirty_it->second.state)) else if (IS_DELETE(dirty_it->second.state))
{ {
auto & sp = inode_space_stats[dirty_it->first.oid.inode]; auto & sp = inode_space_stats[dirty_it->first.oid.inode];
if (sp > data_block_size) if (sp > block_size)
sp -= data_block_size; sp -= block_size;
else else
inode_space_stats.erase(dirty_it->first.oid.inode); inode_space_stats.erase(dirty_it->first.oid.inode);
} }

View File

@ -97,7 +97,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
return false; return false;
} }
} }
if (wait_big && !is_del && !deleted && op->len < data_block_size && if (wait_big && !is_del && !deleted && op->len < block_size &&
immediate_commit != IMMEDIATE_ALL) immediate_commit != IMMEDIATE_ALL)
{ {
// Issue an additional sync so that the previous big write can reach the journal // Issue an additional sync so that the previous big write can reach the journal
@ -122,7 +122,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT; state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
else else
{ {
state = (op->len == data_block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE); state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
if (state == BS_ST_SMALL_WRITE && throttle_small_writes) if (state == BS_ST_SMALL_WRITE && throttle_small_writes)
clock_gettime(CLOCK_REALTIME, &PRIV(op)->tv_begin); clock_gettime(CLOCK_REALTIME, &PRIV(op)->tv_begin);
if (wait_del) if (wait_del)

View File

@ -84,7 +84,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
fprintf(stderr, "Invalid device block size specified: %lu\n", device_block_size); fprintf(stderr, "Invalid device block size specified: %lu\n", device_block_size);
exit(1); exit(1);
} }
if (object_size < device_block_size || object_size > MAX_DATA_BLOCK_SIZE || if (object_size < device_block_size || object_size > MAX_BLOCK_SIZE ||
object_size & (object_size-1) != 0) object_size & (object_size-1) != 0)
{ {
fprintf(stderr, "Invalid object size specified: %lu\n", object_size); fprintf(stderr, "Invalid object size specified: %lu\n", object_size);

View File

@ -296,7 +296,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
} }
bs_bitmap_size = bs_block_size / bs_bitmap_granularity / 8; bs_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
uint32_t block_order; uint32_t block_order;
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_DATA_BLOCK_SIZE || bs_block_size >= MAX_DATA_BLOCK_SIZE) if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
{ {
throw std::runtime_error("Bad block size"); throw std::runtime_error("Bad block size");
} }

View File

@ -6,8 +6,8 @@
#include "messenger.h" #include "messenger.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#define MIN_DATA_BLOCK_SIZE 4*1024 #define MIN_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024 #define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024 #define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024 #define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
#define INODE_LIST_DONE 1 #define INODE_LIST_DONE 1

View File

@ -1,944 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "blockstore_impl.h"
#include "osd_id.h"
#include "crc32c.h"
#include "rw_blocking.h"
// DM_ST_* state codes. NOTE(review): presumably the values taken by
// resizer_data_moving_t::state (which defaults to 0, i.e. DM_ST_EMPTY) while a
// data block is being copied during resize - confirm against resize_copy_data().
#define DM_ST_EMPTY 0
#define DM_ST_TO_READ 1
#define DM_ST_READING 2
#define DM_ST_TO_WRITE 3
#define DM_ST_WRITING 4
// Bookkeeping record for one block handled by the resizer.
// NOTE(review): the exact life cycle is defined by the resize code, which is
// not part of this view - confirm the field meanings there.
struct resizer_data_moving_t
{
    // Current state of this slot; starts at 0
    int state{0};
    // Buffer attached to this slot; none by default
    void *buf{NULL};
    // Source location of the block (left uninitialized, as before)
    uint64_t old_loc;
    // Destination location of the block (left uninitialized, as before)
    uint64_t new_loc;
};
struct disk_tool_t
{
/**** Parameters ****/
std::map<std::string, std::string> options;
std::string journal_device;
uint32_t journal_block_size;
uint64_t journal_offset;
uint64_t journal_len;
bool all;
std::string meta_device;
uint32_t meta_block_size;
uint64_t meta_offset;
uint64_t meta_len;
uint64_t meta_pos;
std::string data_device;
uint64_t data_offset;
uint64_t data_len;
uint64_t data_block_size;
uint64_t clean_entry_bitmap_size, clean_entry_size;
uint32_t bitmap_granularity;
// resize data and/or move metadata and journal
int iodepth;
char *new_meta_device, *new_journal_device;
uint64_t new_data_offset, new_data_len;
uint64_t new_journal_offset, new_journal_len;
uint64_t new_meta_offset, new_meta_len;
/**** State ****/
uint64_t journal_pos, journal_calc_data_pos;
int journal_fd, meta_fd;
bool first;
int data_fd;
allocator *data_alloc;
std::map<uint64_t, uint64_t> data_remap;
std::map<uint64_t, uint64_t>::iterator remap_it;
ring_loop_t *ringloop;
ring_consumer_t ring_consumer;
int remap_active;
uint8_t *new_buf, *new_journal_ptr, *new_journal_data;
uint64_t new_journal_in_pos;
int64_t data_idx_diff;
uint64_t total_blocks, free_first, free_last;
uint64_t new_clean_entry_bitmap_size, new_clean_entry_size, new_entries_per_block;
int new_journal_fd, new_meta_fd;
resizer_data_moving_t *moving_blocks;
bool started;
void *small_write_data;
uint32_t data_crc32;
uint32_t crc32_last;
uint32_t new_crc32_prev;
/**** Commands ****/
int dump_journal();
int dump_meta();
int resize_data();
/**** Methods ****/
void dump_journal_entry(int num, journal_entry *je);
int process_journal(std::function<int(void*)> block_fn);
int process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn);
int process_meta(std::function<void(blockstore_meta_header_v1_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn);
void dump_meta_header(blockstore_meta_header_v1_t *hdr);
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
void resize_init(blockstore_meta_header_v1_t *hdr);
int resize_remap_blocks();
int resize_copy_data();
int resize_rewrite_journal();
int resize_rewrite_meta();
};
// Entry point of the disk tool.
//
// Command line handling:
// - when invoked as "vitastor-dump-journal", the "dump-journal" command is implied;
// - "--all" sets disk_tool_t::all, "--help" forces the usage text;
// - any other "--key value" pair is stored in disk_tool_t::options;
// - remaining arguments form the command and its positional parameters.
//
// Returns the exit code of the selected command, 1 on a usage error,
// 0 after printing the general usage text.
int main(int argc, char *argv[])
{
    disk_tool_t self = {};
    std::vector<char*> cmd;
    char *exe_name = strrchr(argv[0], '/');
    exe_name = exe_name ? exe_name+1 : argv[0];
    bool aliased = false;
    if (!strcmp(exe_name, "vitastor-dump-journal"))
    {
        cmd.push_back((char*)"dump-journal");
        aliased = true;
    }
    for (int i = 1; i < argc; i++)
    {
        if (!strcmp(argv[i], "--all"))
        {
            self.all = true;
        }
        else if (!strcmp(argv[i], "--help"))
        {
            cmd.clear();
            cmd.push_back((char*)"help");
        }
        else if (argv[i][0] == '-' && argv[i][1] == '-')
        {
            // A trailing "--key" has no value: argv[argc] is NULL and must not
            // be converted to std::string
            if (i+1 >= argc)
            {
                fprintf(stderr, "Option %s requires a value\n", argv[i]);
                return 1;
            }
            char *key = argv[i]+2;
            self.options[key] = argv[++i];
        }
        else
        {
            cmd.push_back(argv[i]);
        }
    }
    if (cmd.size() && !strcmp(cmd[0], "dump-journal"))
    {
        if (cmd.size() < 5)
        {
            fprintf(stderr, "USAGE: %s%s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0], aliased ? "" : " dump-journal");
            return 1;
        }
        self.journal_device = cmd[1];
        self.journal_block_size = strtoul(cmd[2], NULL, 10);
        self.journal_offset = strtoull(cmd[3], NULL, 10);
        self.journal_len = strtoull(cmd[4], NULL, 10);
        return self.dump_journal();
    }
    else if (cmd.size() && !strcmp(cmd[0], "dump-meta"))
    {
        if (cmd.size() < 5)
        {
            fprintf(stderr, "USAGE: %s dump-meta <meta_file> <meta_block_size> <offset> <size>\n", argv[0]);
            return 1;
        }
        self.meta_device = cmd[1];
        self.meta_block_size = strtoul(cmd[2], NULL, 10);
        self.meta_offset = strtoull(cmd[3], NULL, 10);
        self.meta_len = strtoull(cmd[4], NULL, 10);
        return self.dump_meta();
    }
    else if (cmd.size() && !strcmp(cmd[0], "resize"))
    {
        return self.resize_data();
    }
    else
    {
        printf(
            "USAGE:\n"
            "  %s dump-journal [--all] <journal_file> <journal_block_size> <offset> <size>\n"
            "  %s dump-meta <meta_file> <meta_block_size> <offset> <size>\n"
            ,
            argv[0], argv[0]
        );
    }
    return 0;
}
// Print the contents of the journal to stdout.
//
// With `all` set, every journal block in [0, journal_len) is read and
// classified as zeroes, journal entries or random data. Otherwise the journal
// is walked from its superblock by process_journal().
//
// Returns 0 on success, 1 when the parameters are invalid, the device can't
// be opened or read, or the journal superblock is rejected by process_journal().
int disk_tool_t::dump_journal()
{
    if (journal_block_size < DIRECT_IO_ALIGNMENT || (journal_block_size % DIRECT_IO_ALIGNMENT) ||
        journal_block_size > 128*1024)
    {
        fprintf(stderr, "Invalid journal block size\n");
        return 1;
    }
    journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDONLY);
    if (journal_fd < 0)
    {
        fprintf(stderr, "Failed to open journal device %s: %s\n", journal_device.c_str(), strerror(errno));
        return 1;
    }
    int result = 0;
    if (all)
    {
        void *journal_buf = memalign_or_die(MEM_ALIGNMENT, journal_block_size);
        journal_pos = 0;
        while (journal_pos < journal_len)
        {
            ssize_t r = pread(journal_fd, journal_buf, journal_block_size, journal_offset+journal_pos);
            if (r != (ssize_t)journal_block_size)
            {
                // Report the failure explicitly: an assert() would vanish in NDEBUG builds
                fprintf(
                    stderr, "offset %08lx: failed to read journal block: %s\n",
                    journal_pos, r < 0 ? strerror(errno) : "short read"
                );
                result = 1;
                break;
            }
            // Check whether the whole block consists of zeroes
            uint64_t s;
            for (s = 0; s < journal_block_size; s += 8)
            {
                if (*((uint64_t*)((uint8_t*)journal_buf+s)) != 0)
                    break;
            }
            if (s == journal_block_size)
            {
                printf("offset %08lx: zeroes\n", journal_pos);
                journal_pos += journal_block_size;
            }
            else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC)
            {
                printf("offset %08lx:\n", journal_pos);
                // process_journal_block() advances journal_pos itself
                process_journal_block(journal_buf, [this](int num, journal_entry *je) { dump_journal_entry(num, je); });
            }
            else
            {
                printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", journal_pos, *((uint64_t*)journal_buf));
                journal_pos += journal_block_size;
            }
        }
        free(journal_buf);
    }
    else
    {
        // Propagate the result so that an invalid journal yields a non-zero exit code
        result = process_journal([this](void *data)
        {
            printf("offset %08lx:\n", journal_pos);
            int r = process_journal_block(data, [this](int num, journal_entry *je) { dump_journal_entry(num, je); });
            if (r <= 0)
                printf("end of the journal\n");
            return r;
        });
    }
    close(journal_fd);
    return result;
}
// Walk the journal starting from its superblock.
//
// Reads the first journal block, validates it as a JE_START entry and passes
// it to block_fn. Then reads blocks one by one starting at
// start.journal_start, wrapping to the first block after the superblock when
// journal_len is reached, until block_fn returns a value <= 0.
//
// block_fn receives a block buffer and is expected to advance journal_pos
// (process_journal_block() does that).
//
// Returns 0 on success, 1 if the superblock is invalid or a block can't be read.
int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{
    void *data = memalign_or_die(MEM_ALIGNMENT, journal_block_size);
    int result = 0;
    journal_pos = 0;
    ssize_t rd = pread(journal_fd, data, journal_block_size, journal_offset+journal_pos);
    if (rd != (ssize_t)journal_block_size)
    {
        fprintf(
            stderr, "offset %08lx: failed to read journal block: %s\n",
            journal_pos, rd < 0 ? strerror(errno) : "short read"
        );
        free(data);
        return 1;
    }
    journal_entry *je = (journal_entry*)(data);
    if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
    {
        fprintf(stderr, "offset %08lx: journal superblock is invalid\n", journal_pos);
        // Fall through to the common cleanup so that the buffer isn't leaked
        result = 1;
    }
    else
    {
        block_fn(data);
        // Entries after the superblock start a new crc32 chain
        started = false;
        journal_pos = je->start.journal_start;
        while (1)
        {
            if (journal_pos >= journal_len)
                journal_pos = journal_block_size;
            rd = pread(journal_fd, data, journal_block_size, journal_offset+journal_pos);
            if (rd != (ssize_t)journal_block_size)
            {
                fprintf(
                    stderr, "offset %08lx: failed to read journal block: %s\n",
                    journal_pos, rd < 0 ? strerror(errno) : "short read"
                );
                result = 1;
                break;
            }
            if (block_fn(data) <= 0)
                break;
        }
    }
    free(data);
    return result;
}
// Iterate over journal entries stored in one journal block.
//
// buf     - buffer holding journal_block_size bytes of the block
// iter_fn - callback invoked as iter_fn(index, entry) for every accepted entry;
//           for small writes, small_write_data and data_crc32 are valid during the call
//
// Advances journal_pos past the block and past the data of the small writes
// it describes. Stops at the first entry with a bad magic or type, and, unless
// `all` is set, at the first entry with a broken crc32 or crc32_prev chain.
//
// Returns the number of entries processed.
int disk_tool_t::process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn)
{
    uint32_t pos = 0;
    journal_pos += journal_block_size;
    int entry = 0;
    bool wrapped = false;
    while (pos < journal_block_size)
    {
        journal_entry *je = (journal_entry*)((uint8_t*)buf + pos);
        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
            (!all && started && je->crc32_prev != crc32_last))
        {
            break;
        }
        // A zero-sized entry would never advance <pos> (endless loop with --all),
        // and an entry longer than the rest of the block would make je_crc32()
        // read past the end of the buffer
        if (je->size == 0 || je->size > journal_block_size - pos)
        {
            break;
        }
        bool crc32_valid = je_crc32(je) == je->crc32;
        if (!all && !crc32_valid)
        {
            break;
        }
        started = true;
        crc32_last = je->crc32;
        const bool is_small_write = je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT;
        if (is_small_write)
        {
            journal_calc_data_pos = journal_pos;
            if (journal_pos + je->small_write.len > journal_len)
            {
                // data continues from the beginning of the journal
                journal_calc_data_pos = journal_pos = journal_block_size;
                wrapped = true;
            }
            journal_pos += je->small_write.len;
            if (journal_pos >= journal_len)
            {
                journal_pos = journal_block_size;
                wrapped = true;
            }
            small_write_data = memalign_or_die(MEM_ALIGNMENT, je->small_write.len);
            // The read must stay outside of assert(), otherwise it's not
            // performed at all when the tool is built with NDEBUG
            const ssize_t data_read = pread(
                journal_fd, small_write_data, je->small_write.len,
                journal_offset+je->small_write.data_offset
            );
            assert(data_read == (ssize_t)je->small_write.len);
            (void)data_read;
            data_crc32 = crc32c(0, small_write_data, je->small_write.len);
        }
        iter_fn(entry, je);
        if (is_small_write)
        {
            free(small_write_data);
            small_write_data = NULL;
        }
        pos += je->size;
        entry++;
    }
    if (wrapped)
    {
        // Small write data wrapped around the journal end, so this is the last block
        journal_pos = journal_len;
    }
    return entry;
}
/**
 * Print a one-line human-readable description of a journal entry to stdout.
 *
 * @param num  index of the entry inside its journal block
 * @param je   the entry to describe
 *
 * For small writes this relies on journal_calc_data_pos and data_crc32, which
 * process_journal_block() fills in right before invoking the callback.
 */
void disk_tool_t::dump_journal_entry(int num, journal_entry *je)
{
    printf("entry % 3d: crc32=%08x %s prev=%08x ", num, je->crc32, (je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)"), je->crc32_prev);
    if (je->type == JE_START)
    {
        printf("je_start start=%08lx\n", je->start.journal_start);
    }
    else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
    {
        printf(
            "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
            je->small_write.oid.inode, je->small_write.oid.stripe,
            je->small_write.version, je->small_write.offset, je->small_write.len,
            je->small_write.data_offset
        );
        if (journal_calc_data_pos != je->small_write.data_offset)
        {
            // Report the value that was actually compared: journal_pos has already been
            // advanced past the payload when this callback runs
            printf(" (mismatched, calculated = %lu)", journal_calc_data_pos);
        }
        printf(
            " data_crc32=%08x%s", je->small_write.crc32_data,
            (data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
        );
        printf("\n");
    }
    else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
    {
        printf(
            "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
            je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
            je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
        );
    }
    else if (je->type == JE_STABLE)
    {
        printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
    }
    else if (je->type == JE_ROLLBACK)
    {
        printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
    }
    else if (je->type == JE_DELETE)
    {
        printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
    }
}
/**
 * Read the metadata area and report its contents through callbacks.
 *
 * @param hdr_fn     called once with the superblock (v1 format), or with NULL when
 *                   no valid v1 superblock is found (old headerless format)
 * @param record_fn  called as record_fn(block_num, entry, bitmap) for every entry with a
 *                   non-zero inode; bitmap is NULL in the old format
 * @return 0 on success, 1 when the parameters are invalid, the device cannot be opened
 *         or the superblock is inconsistent (no callback has been called in that case)
 *
 * Updates meta_block_size, bitmap_granularity, clean_entry_bitmap_size, clean_entry_size
 * and meta_pos as a side effect.
 */
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)> hdr_fn,
    std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn)
{
    // A zero block size would divide by zero below
    if (!meta_block_size || (meta_block_size % DIRECT_IO_ALIGNMENT))
    {
        fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
        return 1;
    }
    meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDONLY);
    if (meta_fd < 0)
    {
        fprintf(stderr, "Failed to open metadata device %s: %s\n", meta_device.c_str(), strerror(errno));
        return 1;
    }
    uint64_t buf_size = 1024*1024;
    if (buf_size % meta_block_size)
        buf_size = 8*meta_block_size;
    if (buf_size > meta_len)
        buf_size = meta_len;
    void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
    lseek64(meta_fd, meta_offset, 0);
    read_blocking(meta_fd, data, buf_size);
    uint64_t block_num = 0;
    // Shared scan loop: reads from the current file position until meta_pos reaches meta_len
    auto scan_entries = [&](bool with_bitmap)
    {
        while (meta_pos < meta_len)
        {
            uint64_t read_len = buf_size < meta_len-meta_pos ? buf_size : meta_len-meta_pos;
            read_blocking(meta_fd, data, read_len);
            meta_pos += read_len;
            for (uint64_t blk = 0; blk < read_len; blk += meta_block_size)
            {
                // "<=": an entry ending exactly at the block boundary is a valid entry
                for (uint64_t ioff = 0; ioff + clean_entry_size <= meta_block_size; ioff += clean_entry_size, block_num++)
                {
                    clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)data + blk + ioff);
                    if (entry->oid.inode)
                    {
                        uint8_t *bmp = with_bitmap ? (uint8_t*)entry->bitmap : (uint8_t*)NULL;
                        record_fn(block_num, entry, bmp);
                    }
                }
            }
        }
    };
    // Check superblock
    blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
    if (hdr->zero == 0 &&
        hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
        hdr->version == BLOCKSTORE_META_VERSION_V1)
    {
        // Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps
        // Keep a private copy: the buffer may be reallocated below and is reused for reads
        blockstore_meta_header_v1_t hdr_copy = *hdr;
        hdr = &hdr_copy;
        if (!hdr->meta_block_size || !hdr->bitmap_granularity)
        {
            fprintf(stderr, "Invalid metadata superblock: zero block size or bitmap granularity\n");
            free(data);
            close(meta_fd);
            return 1;
        }
        if (hdr->meta_block_size != meta_block_size)
        {
            fprintf(stderr, "Using block size of %u bytes based on information from the superblock\n", hdr->meta_block_size);
            meta_block_size = hdr->meta_block_size;
            if (buf_size % meta_block_size)
            {
                buf_size = 8*meta_block_size;
                free(data);
                data = memalign_or_die(MEM_ALIGNMENT, buf_size);
            }
        }
        bitmap_granularity = hdr->bitmap_granularity;
        clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8;
        clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
        hdr_fn(hdr);
        // Entries start right after the superblock block
        meta_pos = meta_block_size;
        lseek64(meta_fd, meta_offset+meta_pos, 0);
        scan_entries(true);
    }
    else
    {
        // Vitastor 0.4-0.5 - static array of clean_disk_entry
        clean_entry_bitmap_size = 0;
        clean_entry_size = sizeof(clean_disk_entry);
        hdr_fn(NULL);
        // There is no superblock: rewind so the blocks consumed by the probe read above
        // are scanned too, and reset the position left over from any previous run
        meta_pos = 0;
        lseek64(meta_fd, meta_offset, 0);
        scan_entries(false);
    }
    free(data);
    close(meta_fd);
    return 0;
}
/**
 * Dump the metadata area to stdout as a JSON document.
 *
 * @return the result of process_meta(): 0 on success, non-zero on failure
 */
int disk_tool_t::dump_meta()
{
    int r = process_meta(
        [this](blockstore_meta_header_v1_t *hdr) { dump_meta_header(hdr); },
        [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); }
    );
    if (r == 0)
    {
        // Close the "entries" array and the root object opened by dump_meta_header().
        // On failure nothing has been opened, so emitting the tail would only
        // produce a stray, invalid JSON fragment.
        printf("\n]}\n");
    }
    return r;
}
/**
 * Print the opening part of the metadata JSON dump.
 *
 * @param hdr  metadata superblock, or NULL for the headerless (0.5) layout
 *
 * Also resets the `first` flag so that the next printed entry gets no separator.
 */
void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr)
{
    first = true;
    if (!hdr)
    {
        // No superblock: only the block size in use is known
        printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", meta_block_size);
        return;
    }
    printf(
        "{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
        hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
    );
}
/**
 * Print one metadata entry as a JSON object (no trailing newline).
 *
 * @param block_num  index of the data block described by the entry
 * @param entry      the on-disk entry
 * @param bitmap     pointer to 2*clean_entry_bitmap_size bytes (bitmap followed by the
 *                   external bitmap), or NULL when the format has no bitmaps
 *
 * Every entry except the first one is preceded by ",\n".
 */
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
    if (!first)
    {
        printf(",\n");
    }
    first = false;
    printf(
        "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu",
        block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
        entry->oid.stripe, entry->version
    );
    if (!bitmap)
    {
        printf("}");
        return;
    }
    printf(",\"bitmap\":\"");
    for (uint64_t idx = 0; idx < clean_entry_bitmap_size; idx++)
    {
        printf("%02x", bitmap[idx]);
    }
    printf("\",\"ext_bitmap\":\"");
    // The external bitmap immediately follows the primary one
    const uint8_t *ext_bitmap = bitmap + clean_entry_bitmap_size;
    for (uint64_t idx = 0; idx < clean_entry_bitmap_size; idx++)
    {
        printf("%02x", ext_bitmap[idx]);
    }
    printf("\"}");
}
/**
 * Entry point of the resize operation: runs all resize stages in order and stops at
 * the first stage that reports an error.
 *
 * @return 0 on success, otherwise the non-zero code of the failed stage
 */
int disk_tool_t::resize_data()
{
    // Stage 1: parse parameters
    int rc = resize_parse_params();
    if (rc != 0)
        return rc;
    // Stage 2: check parameters and mark every referenced data block in the allocator.
    // NOTE(review): data_alloc is dereferenced here but is not created anywhere in the
    // visible code - confirm it is allocated before this point.
    auto on_meta_header = [this](blockstore_meta_header_v1_t *hdr)
    {
        resize_init(hdr);
    };
    auto on_meta_entry = [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
    {
        data_alloc->set(block_num, true);
    };
    rc = process_meta(on_meta_header, on_meta_entry);
    if (rc != 0)
        return rc;
    // Big writes that only exist in the journal occupy data blocks too
    auto on_journal_entry = [this](int num, journal_entry *je)
    {
        const bool is_big_write = (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT);
        if (is_big_write)
        {
            data_alloc->set(je->big_write.location / data_block_size, true);
        }
    };
    rc = process_journal([this, on_journal_entry](void *buf)
    {
        return process_journal_block(buf, on_journal_entry);
    });
    if (rc != 0)
        return rc;
    // Stage 3: remap blocks
    if ((rc = resize_remap_blocks()) != 0)
        return rc;
    // Stage 4: copy data blocks into new places
    if ((rc = resize_copy_data()) != 0)
        return rc;
    // Stage 5: rewrite journal
    if ((rc = resize_rewrite_journal()) != 0)
        return rc;
    // Stage 6: rewrite metadata
    return resize_rewrite_meta();
}
/**
 * Read the current on-disk layout parameters from `options` and validate them.
 *
 * Missing or zero values get defaults: data block size (1 << DEFAULT_ORDER),
 * journal/metadata block size 4096, iodepth 32.
 *
 * @return 0 on success
 * @throws std::runtime_error when a block size or offset is invalid
 *
 * NOTE(review): the new_* target parameters and meta_len used by the other resize
 * stages are not parsed here - confirm where they are supposed to come from.
 */
int disk_tool_t::resize_parse_params()
{
    auto & config = options;
    // FIXME: Deduplicate with blockstore_open.cpp !
    journal_len = strtoull(config["journal_size"].c_str(), NULL, 10);
    data_device = config["data_device"];
    data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
    data_len = strtoull(config["data_size"].c_str(), NULL, 10);
    meta_device = config["meta_device"];
    meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
    data_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
    journal_device = config["journal_device"];
    journal_offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
    journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
    meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
    iodepth = strtoull(config["iodepth"].c_str(), NULL, 10);
    // Validate
    if (!data_block_size)
        data_block_size = (1 << DEFAULT_ORDER);
    if (data_block_size < MIN_BLOCK_SIZE || data_block_size >= MAX_BLOCK_SIZE)
        throw std::runtime_error("Bad block size");
    if (!journal_block_size)
        journal_block_size = 4096;
    else if (journal_block_size % DIRECT_IO_ALIGNMENT)
        throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
    if (!meta_block_size)
        meta_block_size = 4096;
    else if (meta_block_size % DIRECT_IO_ALIGNMENT)
        throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
    // An empty device name is assigned when an area shares a device with another one
    if (journal_device == meta_device || (meta_device == "" && journal_device == data_device))
        journal_device = "";
    if (meta_device == data_device)
        meta_device = "";
    if (meta_offset % meta_block_size)
        throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
    if (journal_offset % journal_block_size)
        throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
    if (!iodepth)
        iodepth = 32;
    // TODO: Check offsets
    // The function is declared to return int: falling off the end is undefined behaviour
    return 0;
}
/**
 * Derive the resize geometry from the current and the new layout.
 *
 * @param hdr  metadata superblock, or NULL when the metadata has no superblock
 *
 * Computes:
 *  - data_idx_diff: value to add to an old block index to get the index of the same
 *    physical block in the new data area, i.e. (old_offset - new_offset) / block_size;
 *  - free_first / free_last: number of old blocks at the head / tail of the current
 *    data area that fall outside of the new data area;
 *  - the entry geometry of the new metadata area.
 *
 * Terminates the process with exit(1) when the areas are misaligned or the new
 * metadata area is too small.
 */
void disk_tool_t::resize_init(blockstore_meta_header_v1_t *hdr)
{
    if (hdr && data_block_size != hdr->data_block_size)
    {
        if (data_block_size)
        {
            fprintf(stderr, "Using data block size of %u bytes from metadata superblock\n", hdr->data_block_size);
        }
        data_block_size = hdr->data_block_size;
    }
    // Work with absolute differences: the operands are unsigned and would wrap around
    // when the new value is smaller than the old one
    const uint64_t len_change = new_data_len > data_len ? new_data_len-data_len : data_len-new_data_len;
    const uint64_t offset_shift = new_data_offset > data_offset ? new_data_offset-data_offset : data_offset-new_data_offset;
    if ((len_change % data_block_size) || (offset_shift % data_block_size))
    {
        fprintf(stderr, "Data alignment mismatch\n");
        exit(1);
    }
    const uint64_t shift_blocks = offset_shift / data_block_size;
    // Moving the start of the area forward makes the index of every surviving block smaller
    data_idx_diff = new_data_offset > data_offset ? -(int64_t)shift_blocks : (int64_t)shift_blocks;
    free_first = new_data_offset > data_offset ? shift_blocks : 0;
    free_last = (new_data_offset+new_data_len < data_offset+data_len)
        ? (data_offset+data_len-new_data_offset-new_data_len) / data_block_size
        : 0;
    // Fall back to 4096 when the granularity is unknown or zero (avoids a division by zero)
    const uint64_t granularity = (hdr && hdr->bitmap_granularity) ? hdr->bitmap_granularity : 4096;
    new_clean_entry_bitmap_size = data_block_size / granularity / 8;
    new_clean_entry_size = sizeof(clean_disk_entry) + 2 * new_clean_entry_bitmap_size;
    new_entries_per_block = meta_block_size/new_clean_entry_size;
    // One block for the superblock plus enough blocks for all entries, rounded up
    uint64_t new_meta_blocks = 1 + (new_data_len/data_block_size + new_entries_per_block-1) / new_entries_per_block;
    if (new_meta_len < meta_block_size*new_meta_blocks)
    {
        fprintf(stderr, "New metadata area size is too small, should be at least %lu bytes\n", meta_block_size*new_meta_blocks);
        exit(1);
    }
}
/**
 * Decide where the blocks that fall outside of the new data area have to be moved.
 *
 * Occupied blocks among the first free_first and the last free_last blocks of the
 * current area are registered in data_remap. Unoccupied ones are marked as used so
 * that the allocator never returns them. Then every registered block gets a free
 * destination block.
 *
 * @return 0 on success, 1 when there are not enough free blocks
 */
int disk_tool_t::resize_remap_blocks()
{
    total_blocks = data_len / data_block_size;
    for (uint64_t i = 0; i < free_first && i < total_blocks; i++)
    {
        if (data_alloc->get(i))
            data_remap[i] = 0;
        else
            data_alloc->set(i, true);
    }
    for (uint64_t i = 0; i < free_last && i < total_blocks; i++)
    {
        // Valid indexes are 0..total_blocks-1, so the tail starts at total_blocks-1
        const uint64_t blk = total_blocks-1-i;
        if (data_alloc->get(blk))
            data_remap[blk] = 0;
        else
            data_alloc->set(blk, true);
    }
    for (auto & p: data_remap)
    {
        uint64_t new_loc = data_alloc->find_free();
        if (new_loc == UINT64_MAX)
        {
            fprintf(stderr, "Not enough space to move data\n");
            return 1;
        }
        // Reserve the destination, otherwise the next lookup returns the same block again
        data_alloc->set(new_loc, true);
        p.second = new_loc;
    }
    return 0;
}
// Copy every block listed in data_remap from its old index (map key) to its new index
// (map value) inside the data device, keeping up to `iodepth` copies in flight.
// Each slot of moving_blocks goes through the states
// DM_ST_EMPTY -> DM_ST_TO_READ -> DM_ST_READING -> DM_ST_TO_WRITE -> DM_ST_WRITING -> DM_ST_EMPTY.
// Returns 0 on success and 1 when the data device cannot be opened; a failed or short
// read/write terminates the process with exit(1).
int disk_tool_t::resize_copy_data()
{
// Fall back to the default queue depth when the configured one is out of range
if (iodepth <= 0 || iodepth > 4096)
{
iodepth = 32;
}
// The ring is created with at least 512 entries
ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
if (data_fd < 0)
{
fprintf(stderr, "Failed to open data device %s: %s\n", data_device.c_str(), strerror(errno));
delete ringloop;
return 1;
}
// NOTE(review): the initial state of every slot depends on how resizer_data_moving_t
// initialises itself; the loop below expects DM_ST_EMPTY - confirm.
moving_blocks = new resizer_data_moving_t[iodepth];
// One allocation is shared by all slots: slot i uses the i-th data_block_size chunk
moving_blocks[0].buf = memalign_or_die(MEM_ALIGNMENT, iodepth*data_block_size);
for (int i = 1; i < iodepth; i++)
{
moving_blocks[i].buf = moving_blocks[0].buf + i*data_block_size;
}
remap_active = 1;
remap_it = data_remap.begin();
// Consumer callback: advances every slot as far as possible without blocking and
// recounts the number of slots that are still busy
ring_consumer.loop = [this]()
{
remap_active = 0;
for (int i = 0; i < iodepth; i++)
{
// Take the next pending remap entry into an idle slot
if (moving_blocks[i].state == DM_ST_EMPTY && remap_it != data_remap.end())
{
uint64_t old_loc = remap_it->first, new_loc = remap_it->second;
moving_blocks[i].state = DM_ST_TO_READ;
moving_blocks[i].old_loc = old_loc;
moving_blocks[i].new_loc = new_loc;
remap_it++;
}
if (moving_blocks[i].state == DM_ST_TO_READ)
{
// When no submission entry is available the slot stays in DM_ST_TO_READ
// and is retried on the next invocation
struct io_uring_sqe *sqe = ringloop->get_sqe();
if (sqe)
{
moving_blocks[i].state = DM_ST_READING;
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ moving_blocks[i].buf, data_block_size };
my_uring_prep_readv(sqe, data_fd, &data->iov, 1, data_offset + moving_blocks[i].old_loc*data_block_size);
data->callback = [this, i](ring_data_t *data)
{
if (data->res != data_block_size)
{
fprintf(
stderr, "Failed to read %lu bytes at %lu from %s: %s\n", data_block_size,
data_offset + moving_blocks[i].old_loc*data_block_size, data_device.c_str(),
data->res < 0 ? strerror(-data->res) : "short read"
);
exit(1);
}
moving_blocks[i].state = DM_ST_TO_WRITE;
ringloop->wakeup();
};
}
}
if (moving_blocks[i].state == DM_ST_TO_WRITE)
{
struct io_uring_sqe *sqe = ringloop->get_sqe();
if (sqe)
{
moving_blocks[i].state = DM_ST_WRITING;
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ moving_blocks[i].buf, data_block_size };
// The destination is addressed relative to data_offset, the same base as the source
my_uring_prep_writev(sqe, data_fd, &data->iov, 1, data_offset + moving_blocks[i].new_loc*data_block_size);
data->callback = [this, i](ring_data_t *data)
{
if (data->res != data_block_size)
{
fprintf(
stderr, "Failed to write %lu bytes at %lu to %s: %s\n", data_block_size,
data_offset + moving_blocks[i].new_loc*data_block_size, data_device.c_str(),
data->res < 0 ? strerror(-data->res) : "short write"
);
exit(1);
}
moving_blocks[i].state = DM_ST_EMPTY;
ringloop->wakeup();
};
}
}
remap_active += moving_blocks[i].state != DM_ST_EMPTY ? 1 : 0;
}
ringloop->submit();
};
ringloop->register_consumer(&ring_consumer);
// Run until the consumer reports that no slot is busy any more
while (1)
{
ringloop->loop();
if (!remap_active)
break;
ringloop->wait();
}
ringloop->unregister_consumer(&ring_consumer);
free(moving_blocks[0].buf);
delete[] moving_blocks;
// NOTE(review): data_fd is closed without fsync() - confirm whether the copied blocks
// have to be durable before the journal and metadata are rewritten.
close(data_fd);
delete ringloop;
return 0;
}
int disk_tool_t::resize_rewrite_journal()
{
// Simply overwriting on the fly may be impossible because old and new areas may overlap
// For now, just build new journal data in memory
new_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_journal_len);
new_journal_ptr = new_buf;
new_journal_data = new_journal_ptr + journal_block_size;
memset(new_buf, 0, new_journal_len);
process_journal([this](void *buf)
{
return process_journal_block(buf, [this](int num, journal_entry *je)
{
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
if (je->type == JE_START)
{
*((journal_entry_start*)ne) = (journal_entry_start){
.magic = JOURNAL_MAGIC,
.type = JE_START,
.size = sizeof(ne->start),
.journal_start = journal_block_size,
.version = JOURNAL_VERSION,
};
ne->crc32 = je_crc32(ne);
new_journal_ptr += journal_block_size;
}
else
{
if (journal_block_size < new_journal_in_pos+je->size)
{
new_journal_ptr = new_journal_data;
if (new_journal_ptr-new_buf >= new_journal_len)
{
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
exit(1);
}
new_journal_data += journal_block_size;
new_journal_in_pos = 0;
if (journal_block_size < je->size)
{
fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je->size);
exit(1);
}
}
memcpy(ne, je, je->size);
ne->crc32_prev = new_crc32_prev;
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
// Change the block reference
auto remap_it = data_remap.find(ne->big_write.location / data_block_size);
if (remap_it != data_remap.end())
{
ne->big_write.location = remap_it->second * data_block_size;
}
ne->big_write.location += data_idx_diff;
}
else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
{
ne->small_write.data_offset = new_journal_data-new_buf;
if (ne->small_write.data_offset + ne->small_write.len > new_journal_len)
{
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
exit(1);
}
memcpy(new_journal_data, small_write_data, ne->small_write.len);
new_journal_data += ne->small_write.len;
}
ne->crc32 = je_crc32(ne);
new_journal_in_pos += ne->size;
new_crc32_prev = ne->crc32;
}
});
});
// FIXME: Write new journal and metadata with journaling if they overlap with old
new_journal_fd = open(new_journal_device, O_DIRECT|O_RDWR);
if (new_journal_fd < 0)
{
fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device, strerror(errno));
return 1;
}
lseek64(new_journal_fd, new_journal_offset, 0);
write_blocking(new_journal_fd, new_buf, new_journal_len);
fsync(new_journal_fd);
close(new_journal_fd);
free(new_buf);
return 0;
}
int disk_tool_t::resize_rewrite_meta()
{
new_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
memset(new_buf, 0, new_meta_len);
int r = process_meta(
[this](blockstore_meta_header_v1_t *hdr)
{
blockstore_meta_header_v1_t *new_hdr = (blockstore_meta_header_v1_t *)new_buf;
new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_VERSION_V1;
new_hdr->meta_block_size = meta_block_size;
new_hdr->data_block_size = data_block_size;
new_hdr->bitmap_granularity = bitmap_granularity ? bitmap_granularity : 4096;
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
auto remap_it = data_remap.find(block_num);
if (remap_it != data_remap.end())
block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last)
return;
block_num += data_idx_diff;
clean_disk_entry *new_entry = (clean_disk_entry*)(new_buf + meta_block_size +
meta_block_size*(block_num / new_entries_per_block) +
new_clean_entry_size*(block_num % new_entries_per_block));
new_entry->oid = entry->oid;
new_entry->version = entry->version;
if (bitmap)
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size);
else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
}
);
if (r != 0)
{
free(new_buf);
return r;
}
new_meta_fd = open(new_meta_device, O_DIRECT|O_RDWR);
if (new_meta_fd < 0)
{
fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device, strerror(errno));
return 1;
}
lseek64(new_meta_fd, new_meta_offset, 0);
write_blocking(new_meta_fd, new_buf, new_meta_len);
fsync(new_meta_fd);
close(new_meta_fd);
free(new_buf);
return 0;
}

224
src/dump_journal.cpp Normal file
View File

@ -0,0 +1,224 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "blockstore_impl.h"
#include "crc32c.h"
// State of the standalone journal dump tool.
// main() initialises it with `{ 0 }`, so the order of the members matters.
struct journal_dump_t
{
// Path of the journal file or device (points into argv)
char *journal_device;
// Journal block size in bytes
uint32_t journal_block;
// Offset of the journal inside journal_device, in bytes
uint64_t journal_offset;
// Length of the journal area in bytes
uint64_t journal_len;
// Current position inside the journal, relative to journal_offset
uint64_t journal_pos;
// Set by --all: scan every block and do not stop at invalid entries
bool all;
// True once an entry has been accepted; enables the crc32_prev chain check
bool started;
// File descriptor of journal_device
int fd;
// crc32 of the last accepted entry
uint32_t crc32_last;
// Prints the entries of one journal block and returns their count
int dump_block(void *buf);
};
/**
 * vitastor-dump-journal: print the contents of a journal in human-readable form.
 *
 * USAGE: <prog> [--all] <journal_file> <journal_block_size> <offset> <size>
 *
 * Without --all the journal is followed from the position stored in its superblock
 * until the first invalid entry. With --all every block of the area is examined.
 *
 * @return 0 on success, 1 on bad arguments, open/allocation failure or an invalid
 *         journal superblock
 */
int main(int argc, char *argv[])
{
    journal_dump_t self = { 0 };
    int b = 1;
    if (argc >= 2 && !strcmp(argv[1], "--all"))
    {
        self.all = true;
        b = 2;
    }
    if (argc < b+4)
    {
        printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
        return 1;
    }
    self.journal_device = argv[b];
    self.journal_block = strtoul(argv[b+1], NULL, 10);
    self.journal_offset = strtoull(argv[b+2], NULL, 10);
    self.journal_len = strtoull(argv[b+3], NULL, 10);
    if (self.journal_block < DIRECT_IO_ALIGNMENT || (self.journal_block % DIRECT_IO_ALIGNMENT) ||
        self.journal_block > 128*1024)
    {
        printf("Invalid journal block size\n");
        return 1;
    }
    self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
    if (self.fd == -1)
    {
        printf("Failed to open journal\n");
        return 1;
    }
    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
    if (!data)
    {
        printf("Failed to allocate %u bytes\n", self.journal_block);
        close(self.fd);
        return 1;
    }
    int exit_code = 0;
    self.journal_pos = 0;
    if (self.all)
    {
        while (self.journal_pos < self.journal_len)
        {
            int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
            assert(r == self.journal_block);
            // Detect blocks consisting only of zeroes
            uint64_t s;
            for (s = 0; s < self.journal_block; s += 8)
            {
                if (*((uint64_t*)((uint8_t*)data+s)) != 0)
                    break;
            }
            if (s == self.journal_block)
            {
                printf("offset %08lx: zeroes\n", self.journal_pos);
                self.journal_pos += self.journal_block;
            }
            else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
            {
                printf("offset %08lx:\n", self.journal_pos);
                // dump_block() advances journal_pos itself
                self.dump_block(data);
            }
            else
            {
                printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
                self.journal_pos += self.journal_block;
            }
        }
    }
    else
    {
        int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
        assert(r == self.journal_block);
        journal_entry *je = (journal_entry*)(data);
        if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
        {
            printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
            // Let callers and scripts notice that nothing could be dumped
            exit_code = 1;
        }
        else
        {
            printf("offset %08lx:\n", self.journal_pos);
            self.dump_block(data);
            // The crc32_prev chain starts anew at the first real entry
            self.started = false;
            self.journal_pos = je->start.journal_start;
            while (1)
            {
                if (self.journal_pos >= self.journal_len)
                    self.journal_pos = self.journal_block;
                r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
                assert(r == self.journal_block);
                printf("offset %08lx:\n", self.journal_pos);
                r = self.dump_block(data);
                if (r <= 0)
                {
                    printf("end of the journal\n");
                    break;
                }
            }
        }
    }
    free(data);
    close(self.fd);
    return exit_code;
}
int journal_dump_t::dump_block(void *buf)
{
uint32_t pos = 0;
journal_pos += journal_block;
int entry = 0;
bool wrapped = false;
while (pos < journal_block)
{
journal_entry *je = (journal_entry*)((uint8_t*)buf + pos);
if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
!all && started && je->crc32_prev != crc32_last)
{
break;
}
bool crc32_valid = je_crc32(je) == je->crc32;
if (!all && !crc32_valid)
{
break;
}
started = true;
crc32_last = je->crc32;
printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
if (je->type == JE_START)
{
printf("je_start start=%08lx\n", je->start.journal_start);
}
else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
{
printf(
"je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe,
je->small_write.version, je->small_write.offset, je->small_write.len,
je->small_write.data_offset
);
if (journal_pos + je->small_write.len > journal_len)
{
// data continues from the beginning of the journal
journal_pos = journal_block;
wrapped = true;
}
if (journal_pos != je->small_write.data_offset)
{
printf(" (mismatched, calculated = %lu)", journal_pos);
}
journal_pos += je->small_write.len;
if (journal_pos >= journal_len)
{
journal_pos = journal_block;
wrapped = true;
}
uint32_t data_crc32 = 0;
void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
assert(pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset) == je->small_write.len);
data_crc32 = crc32c(0, data, je->small_write.len);
free(data);
printf(
" data_crc32=%08x%s", je->small_write.crc32_data,
(data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
);
printf("\n");
}
else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
printf(
"je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
);
}
else if (je->type == JE_STABLE)
{
printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
}
else if (je->type == JE_ROLLBACK)
{
printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
}
else if (je->type == JE_DELETE)
{
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
}
pos += je->size;
entry++;
}
if (wrapped)
{
journal_pos = journal_len;
}
return entry;
}

169
src/dump_meta.cpp Normal file
View File

@ -0,0 +1,169 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <malloc.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "blockstore_impl.h"
#include "osd_id.h"
#include "rw_blocking.h"
// State of the standalone metadata dump tool.
// main() initialises it with `{ 0 }`, so the order of the members matters.
struct meta_dumper_t
{
// Path of the metadata file or device (points into argv)
char *meta_device;
// Metadata block size in bytes; replaced by the superblock value when they differ
uint32_t meta_block_size;
// Offset of the metadata area inside meta_device, in bytes
uint64_t meta_offset;
// Length of the metadata area in bytes
uint64_t meta_len;
// Number of bytes of the entry area that have been read so far
uint64_t meta_pos;
// File descriptor of meta_device
int fd;
// Prints the metadata as JSON to stdout; returns 0 on success, 1 on failure
int dump();
};
int main(int argc, char *argv[])
{
meta_dumper_t self = { 0 };
int b = 1;
if (argc < b+4)
{
printf("USAGE: %s <meta_file> <meta_block_size> <offset> <size>\n", argv[0]);
return 1;
}
self.meta_device = argv[b];
self.meta_block_size = strtoul(argv[b+1], NULL, 10);
self.meta_offset = strtoull(argv[b+2], NULL, 10);
self.meta_len = strtoull(argv[b+3], NULL, 10);
return self.dump();
}
int meta_dumper_t::dump()
{
if (this->meta_block_size % DIRECT_IO_ALIGNMENT)
{
printf("Invalid metadata block size\n");
return 1;
}
this->fd = open(this->meta_device, O_DIRECT|O_RDONLY);
if (this->fd == -1)
{
printf("Failed to open metadata device\n");
return 1;
}
int buf_size = 1024*1024;
if (buf_size % this->meta_block_size)
buf_size = 8*this->meta_block_size;
if (buf_size > this->meta_len)
buf_size = this->meta_len;
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
lseek64(this->fd, this->meta_offset, 0);
read_blocking(this->fd, data, buf_size);
// Check superblock
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
if (hdr->zero == 0 &&
hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
hdr->version == BLOCKSTORE_META_VERSION_V1)
{
// Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps
if (hdr->meta_block_size != this->meta_block_size)
{
printf("Using block size of %u bytes based on information from the superblock\n", hdr->meta_block_size);
this->meta_block_size = hdr->meta_block_size;
if (buf_size % this->meta_block_size)
{
buf_size = 8*this->meta_block_size;
free(data);
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
}
}
this->meta_offset += this->meta_block_size;
this->meta_len -= this->meta_block_size;
uint64_t clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8;
uint64_t clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t block_num = 0;
printf(
"{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
);
bool first = true;
lseek64(this->fd, this->meta_offset, 0);
while (this->meta_pos < this->meta_len)
{
uint64_t read_len = buf_size < this->meta_len-this->meta_pos ? buf_size : this->meta_len-this->meta_pos;
read_blocking(this->fd, data, read_len);
this->meta_pos += read_len;
for (uint64_t blk = 0; blk < read_len; blk += this->meta_block_size)
{
for (uint64_t ioff = 0; ioff < this->meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry *entry = (clean_disk_entry*)(data + blk + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu,\"bitmap\":\""
(first ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[i]);
}
printf("\",\"ext_bitmap\":\"");
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[clean_entry_bitmap_size + i]);
}
printf("\"}");
}
}
}
}
printf("\n]}\n");
}
else
{
// Vitastor 0.4-0.5 - static array of clean_disk_entry
uint64_t clean_entry_size = sizeof(clean_disk_entry);
uint64_t block_num = 0;
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", this->meta_block_size);
bool first = true;
while (this->meta_pos < this->meta_len)
{
uint64_t read_len = buf_size < this->meta_len-this->meta_pos ? buf_size : this->meta_len-this->meta_pos;
read_blocking(this->fd, data, read_len);
this->meta_pos += read_len;
for (uint64_t blk = 0; blk < read_len; blk += this->meta_block_size)
{
for (uint64_t ioff = 0; ioff < this->meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry *entry = (clean_disk_entry*)(data + blk + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu}"
(first ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
}
}
}
}
printf("\n]}\n");
}
free(data);
close(this->fd);
return 0;
}