// Copyright (c) Vitaliy Filippov, 2019+ // License: VNPL-1.1 (see README.md for details) #define _LARGEFILE64_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "json11/json11.hpp" #include "blockstore_impl.h" #include "blockstore_disk.h" #include "osd_id.h" #include "crc32c.h" #include "rw_blocking.h" #define DM_ST_EMPTY 0 #define DM_ST_TO_READ 1 #define DM_ST_READING 2 #define DM_ST_TO_WRITE 3 #define DM_ST_WRITING 4 // vITADisk #define VITASTOR_DISK_MAGIC 0x6b73694441544976 #define VITASTOR_DISK_MAX_SB_SIZE 128*1024 struct __attribute__((__packed__)) vitastor_disk_superblock_t { uint64_t magic; uint32_t crc32c; uint32_t size; uint8_t json_data[]; }; struct resizer_data_moving_t { int state = 0; void *buf = NULL; uint64_t old_loc, new_loc; }; struct disk_tool_t { /**** Parameters ****/ std::map options; bool all; bool json; blockstore_disk_t dsk; // resize data and/or move metadata and journal int iodepth; std::string new_meta_device, new_journal_device; uint64_t new_data_offset, new_data_len; uint64_t new_journal_offset, new_journal_len; uint64_t new_meta_offset, new_meta_len; /**** State ****/ uint64_t meta_pos; uint64_t journal_pos, journal_calc_data_pos; bool first, first2; allocator *data_alloc; std::map data_remap; std::map::iterator remap_it; ring_loop_t *ringloop; ring_consumer_t ring_consumer; int remap_active; uint8_t *new_journal_buf, *new_meta_buf, *new_journal_ptr, *new_journal_data; uint64_t new_journal_in_pos; int64_t data_idx_diff; uint64_t total_blocks, free_first, free_last; uint64_t new_clean_entry_bitmap_size, new_clean_entry_size, new_entries_per_block; int new_journal_fd, new_meta_fd; resizer_data_moving_t *moving_blocks; bool started; void *small_write_data; uint32_t data_crc32; uint32_t crc32_last; uint32_t new_crc32_prev; /**** Commands ****/ int dump_journal(); int dump_meta(); int resize_data(); /**** Methods ****/ ~disk_tool_t(); void dump_journal_entry(int num, journal_entry *je, bool json); int process_journal(std::function block_fn); int process_journal_block(void *buf, std::function iter_fn); int process_meta(std::function hdr_fn, std::function record_fn); void dump_meta_header(blockstore_meta_header_v1_t *hdr); void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap); int resize_parse_params(); void resize_init(blockstore_meta_header_v1_t *hdr); int resize_remap_blocks(); int resize_copy_data(); int resize_rewrite_journal(); int resize_write_new_journal(); int resize_rewrite_meta(); int resize_write_new_meta(); int udev_import(std::string device); int write_sb(std::string device); int exec_osd(std::string device); int start_osd(std::string device); int stop_osd(std::string device); json11::Json read_osd_superblock(std::string device, std::string & device_type); uint32_t write_osd_superblock(std::string device, json11::Json params); }; void disk_tool_simple_offsets(json11::Json cfg, bool json_output); int main(int argc, char *argv[]) { disk_tool_t self = {}; std::vector cmd; char *exe_name = strrchr(argv[0], '/'); exe_name = exe_name ? exe_name+1 : argv[0]; bool aliased = false; if (!strcmp(exe_name, "vitastor-dump-journal")) { cmd.push_back((char*)"dump-journal"); aliased = true; } for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--all")) { self.all = true; } else if (!strcmp(argv[i], "--json")) { self.json = true; } else if (!strcmp(argv[i], "--help")) { cmd.clear(); cmd.push_back((char*)"help"); } else if (!strcmp(argv[i], "--force")) { self.options["force"] = "1"; } else if (argv[i][0] == '-' && argv[i][1] == '-') { char *key = argv[i]+2; self.options[key] = argv[++i]; } else { cmd.push_back(argv[i]); } } if (cmd.size() && !strcmp(cmd[0], "dump-journal")) { if (cmd.size() < 5) { fprintf(stderr, "USAGE: %s%s [--all] [--json] \n", argv[0], aliased ? "" : " dump-journal"); return 1; } self.dsk.journal_device = cmd[1]; self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10); self.dsk.journal_offset = strtoull(cmd[3], NULL, 10); self.dsk.journal_len = strtoull(cmd[4], NULL, 10); return self.dump_journal(); } else if (cmd.size() && !strcmp(cmd[0], "dump-meta")) { if (cmd.size() < 5) { fprintf(stderr, "USAGE: %s dump-meta \n", argv[0]); return 1; } self.dsk.meta_device = cmd[1]; self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10); self.dsk.meta_offset = strtoull(cmd[3], NULL, 10); self.dsk.meta_len = strtoull(cmd[4], NULL, 10); return self.dump_meta(); } else if (cmd.size() && !strcmp(cmd[0], "resize")) { return self.resize_data(); } else if (cmd.size() && !strcmp(cmd[0], "simple-offsets")) { // Calculate offsets for simple & stupid OSD deployment without superblock if (cmd.size() > 1) { self.options["device"] = cmd[1]; } disk_tool_simple_offsets(self.options, self.json); return 0; } else if (cmd.size() && !strcmp(cmd[0], "udev")) { if (cmd.size() != 2) { fprintf(stderr, "Exactly 1 device path argument is required\n"); return 1; } return self.udev_import(cmd[1]); } else if (cmd.size() && !strcmp(cmd[0], "write-sb")) { if (cmd.size() != 2) { fprintf(stderr, "Exactly 1 device path argument is required\n"); return 1; } return self.write_sb(cmd[1]); } else if (cmd.size() && !strcmp(cmd[0], "start")) { if (cmd.size() == 1) { fprintf(stderr, "Device path is missing\n"); return 1; } int res = 0; for (int i = 1; i < cmd.size(); i++) { int r = self.start_osd(cmd[i]); if (r) res = r; } return res; } else if (cmd.size() && !strcmp(cmd[0], "exec-osd")) { if (cmd.size() != 2) { fprintf(stderr, "Exactly 1 device path argument is required\n"); return 1; } return self.exec_osd(cmd[1]); } else { printf( "Vitastor disk management tool\n" "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n" "\n" "USAGE:\n" "%s dump-journal [--all] [--json] \n" " Dump journal in human-readable or JSON (if --json is specified) format.\n" " Without --all, only actual part of the journal is dumped.\n" " With --all, the whole journal area is scanned for journal entries,\n" " some of which may be outdated.\n" "\n" "%s dump-meta \n" " Dump metadata in JSON format.\n" "\n" "%s resize [--iodepth 32]\n" " Resize data area and/or rewrite/move journal and metadata\n" " ALL_OSD_PARAMETERS must include all (at least all disk-related)\n" " parameters from OSD command line (i.e. from systemd unit).\n" " NEW_PARAMETERS include new disk layout parameters:\n" " [--new_data_offset ] resize data area so it starts at \n" " [--new_data_len ] resize data area to bytes\n" " [--new_meta_device ] use for new metadata\n" " [--new_meta_offset ] make new metadata area start at \n" " [--new_meta_len ] make new metadata area bytes long\n" " [--new_journal_device ] use for new journal\n" " [--new_journal_offset ] make new journal area start at \n" " [--new_journal_len ] make new journal area bytes long\n" " If any of the new layout parameter options are not specified, old values\n" " will be used.\n" "\n" "%s simple-offsets \n" " Calculate offsets for simple&stupid (no superblock) OSD deployment. Options:\n" " --object_size 128k Set blockstore block size\n" " --bitmap_granularity 4k Set bitmap granularity\n" " --journal_size 16M Set journal size\n" " --device_block_size 4k Set device block size\n" " --journal_offset 0 Set journal offset\n" " --device_size 0 Set device size\n" " --format text Result format: json, options, env, or text\n" "\n" "%s udev \n" " Try to read Vitastor OSD superblock from and print variables for udev.\n" "\n" "%s exec-osd \n" " Read Vitastor OSD superblock from and start the OSD with parameters from it.\n" "\n" "%s start \n" "%s restart \n" " Configure systemd unit and start Vitastor OSD on which should be\n" " a GPT Vitastor partition. Restart stops the corresponding OSD, reconfigures\n" " and restarts it.\n" "\n" "%s stop \n" " Stop Vitastor OSD corresponding to device using systemd.\n" , argv[0], argv[0], argv[0], argv[0], argv[0], argv[0], argv[0], argv[0], argv[0] ); } return 0; } disk_tool_t::~disk_tool_t() { if (data_alloc) { delete data_alloc; data_alloc = NULL; } } int disk_tool_t::dump_journal() { if (dsk.journal_block_size < DIRECT_IO_ALIGNMENT || (dsk.journal_block_size % DIRECT_IO_ALIGNMENT) || dsk.journal_block_size > 128*1024) { fprintf(stderr, "Invalid journal block size\n"); return 1; } first = true; if (json) printf("[\n"); if (all) { dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY); if (dsk.journal_fd < 0) { fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); return 1; } void *journal_buf = memalign_or_die(MEM_ALIGNMENT, dsk.journal_block_size); journal_pos = 0; while (journal_pos < dsk.journal_len) { int r = pread(dsk.journal_fd, journal_buf, dsk.journal_block_size, dsk.journal_offset+journal_pos); assert(r == dsk.journal_block_size); uint64_t s; for (s = 0; s < dsk.journal_block_size; s += 8) { if (*((uint64_t*)((uint8_t*)journal_buf+s)) != 0) break; } if (json) { printf("%s{\"offset\":\"0x%lx\"", first ? "" : ",\n", journal_pos); first = false; } if (s == dsk.journal_block_size) { if (json) printf(",\"type\":\"zero\"}"); else printf("offset %08lx: zeroes\n", journal_pos); journal_pos += dsk.journal_block_size; } else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC) { if (!json) printf("offset %08lx:\n", journal_pos); else printf(",\"entries\":[\n"); first2 = true; process_journal_block(journal_buf, [this](int num, journal_entry *je) { dump_journal_entry(num, je, json); }); if (json) printf(first2 ? "]}" : "\n]}"); } else { if (json) printf(",\"type\":\"data\",\"pattern\":\"%08lx\"}", *((uint64_t*)journal_buf)); else printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%08lx)\n", journal_pos, *((uint64_t*)journal_buf)); journal_pos += dsk.journal_block_size; } } free(journal_buf); close(dsk.journal_fd); dsk.journal_fd = -1; } else { process_journal([this](void *data) { first2 = true; if (!json) printf("offset %08lx:\n", journal_pos); auto pos = journal_pos; int r = process_journal_block(data, [this, pos](int num, journal_entry *je) { if (json && first2) { printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos); first = false; } dump_journal_entry(num, je, json); }); if (json) printf(first2 ? "" : "\n]}"); else if (r <= 0) printf("end of the journal\n"); return r; }); } if (json) printf(first ? "]\n" : "\n]\n"); return 0; } int disk_tool_t::process_journal(std::function block_fn) { dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY); if (dsk.journal_fd < 0) { fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); return 1; } void *data = memalign_or_die(MEM_ALIGNMENT, dsk.journal_block_size); journal_pos = 0; int r = pread(dsk.journal_fd, data, dsk.journal_block_size, dsk.journal_offset+journal_pos); assert(r == dsk.journal_block_size); journal_entry *je = (journal_entry*)(data); if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32) { fprintf(stderr, "offset %08lx: journal superblock is invalid\n", journal_pos); r = 1; } else { started = false; crc32_last = 0; block_fn(data); started = false; crc32_last = 0; journal_pos = je->start.journal_start; while (1) { if (journal_pos >= dsk.journal_len) journal_pos = dsk.journal_block_size; r = pread(dsk.journal_fd, data, dsk.journal_block_size, dsk.journal_offset+journal_pos); assert(r == dsk.journal_block_size); r = block_fn(data); if (r <= 0) break; } } close(dsk.journal_fd); dsk.journal_fd = -1; free(data); return r; } int disk_tool_t::process_journal_block(void *buf, std::function iter_fn) { uint32_t pos = 0; journal_pos += dsk.journal_block_size; int entry = 0; bool wrapped = false; while (pos <= dsk.journal_block_size-JOURNAL_ENTRY_HEADER_SIZE) { journal_entry *je = (journal_entry*)((uint8_t*)buf + pos); if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX || !all && started && je->crc32_prev != crc32_last || pos > dsk.journal_block_size-je->size) { break; } bool crc32_valid = je_crc32(je) == je->crc32; if (!all && !crc32_valid) { break; } started = true; crc32_last = je->crc32; if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) { journal_calc_data_pos = journal_pos; if (journal_pos + je->small_write.len > dsk.journal_len) { // data continues from the beginning of the journal journal_calc_data_pos = journal_pos = dsk.journal_block_size; wrapped = true; } journal_pos += je->small_write.len; if (journal_pos >= dsk.journal_len) { journal_pos = dsk.journal_block_size; wrapped = true; } small_write_data = memalign_or_die(MEM_ALIGNMENT, je->small_write.len); assert(pread(dsk.journal_fd, small_write_data, je->small_write.len, dsk.journal_offset+je->small_write.data_offset) == je->small_write.len); data_crc32 = crc32c(0, small_write_data, je->small_write.len); } iter_fn(entry, je); if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) { free(small_write_data); small_write_data = NULL; } pos += je->size; entry++; } if (wrapped) { journal_pos = dsk.journal_len; } return entry; } void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json) { if (json) { if (!first2) printf(",\n"); first2 = false; printf( "{\"crc32\":\"%08x\",\"valid\":%s,\"crc32_prev\":\"%08x\"", je->crc32, (je_crc32(je) == je->crc32 ? "true" : "false"), je->crc32_prev ); } else { printf( "entry % 3d: crc32=%08x %s prev=%08x ", num, je->crc32, (je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)"), je->crc32_prev ); } if (je->type == JE_START) { printf( json ? ",\"type\":\"start\",\"start\":\"0x%lx\"}" : "je_start start=%08lx\n", je->start.journal_start ); } else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) { printf( json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\"" : "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx", je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "", je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version, je->small_write.offset, je->small_write.len, je->small_write.data_offset ); if (journal_calc_data_pos != je->small_write.data_offset) { printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\"" : " (mismatched, calculated = %lu)", journal_pos); } printf( json ? ",\"data_crc32\":\"%08x\",\"data_valid\":%s}" : " data_crc32=%08x%s\n", je->small_write.crc32_data, (data_crc32 != je->small_write.crc32_data ? (json ? "false" : " (invalid)") : (json ? "true" : " (valid)")) ); } else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT) { printf( json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"loc\":\"0x%lx\"}" : "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n", je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location ); } else if (je->type == JE_STABLE) { printf( json ? ",\"type\":\"stable\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}" : "je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version ); } else if (je->type == JE_ROLLBACK) { printf( json ? ",\"type\":\"rollback\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}" : "je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version ); } else if (je->type == JE_DELETE) { printf( json ? ",\"type\":\"delete\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}" : "je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version ); } } int disk_tool_t::process_meta(std::function hdr_fn, std::function record_fn) { if (dsk.meta_block_size % DIRECT_IO_ALIGNMENT) { fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT); return 1; } dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY); if (dsk.meta_fd < 0) { fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno)); return 1; } int buf_size = 1024*1024; if (buf_size % dsk.meta_block_size) buf_size = 8*dsk.meta_block_size; if (buf_size > dsk.meta_len) buf_size = dsk.meta_len; void *data = memalign_or_die(MEM_ALIGNMENT, buf_size); lseek64(dsk.meta_fd, dsk.meta_offset, 0); read_blocking(dsk.meta_fd, data, buf_size); // Check superblock blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data; if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1 && hdr->version == BLOCKSTORE_META_VERSION_V1) { // Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps if (hdr->meta_block_size != dsk.meta_block_size) { fprintf(stderr, "Using block size of %u bytes based on information from the superblock\n", hdr->meta_block_size); dsk.meta_block_size = hdr->meta_block_size; if (buf_size % dsk.meta_block_size) { buf_size = 8*dsk.meta_block_size; free(data); data = memalign_or_die(MEM_ALIGNMENT, buf_size); } } dsk.bitmap_granularity = hdr->bitmap_granularity; dsk.clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8; dsk.clean_entry_size = sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size; uint64_t block_num = 0; hdr_fn(hdr); meta_pos = dsk.meta_block_size; lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0); while (meta_pos < dsk.meta_len) { uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos; read_blocking(dsk.meta_fd, data, read_len); meta_pos += read_len; for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size) { for (uint64_t ioff = 0; ioff <= dsk.meta_block_size-dsk.clean_entry_size; ioff += dsk.clean_entry_size, block_num++) { clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)data + blk + ioff); if (entry->oid.inode) { record_fn(block_num, entry, entry->bitmap); } } } } } else { // Vitastor 0.4-0.5 - static array of clean_disk_entry dsk.clean_entry_bitmap_size = 0; dsk.clean_entry_size = sizeof(clean_disk_entry); uint64_t block_num = 0; hdr_fn(NULL); while (meta_pos < dsk.meta_len) { uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos; read_blocking(dsk.meta_fd, data, read_len); meta_pos += read_len; for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size) { for (uint64_t ioff = 0; ioff < dsk.meta_block_size-dsk.clean_entry_size; ioff += dsk.clean_entry_size, block_num++) { clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)data + blk + ioff); if (entry->oid.inode) { record_fn(block_num, entry, NULL); } } } } } free(data); close(dsk.meta_fd); dsk.meta_fd = -1; return 0; } int disk_tool_t::dump_meta() { int r = process_meta( [this](blockstore_meta_header_v1_t *hdr) { dump_meta_header(hdr); }, [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); } ); printf("\n]}\n"); return r; } void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr) { if (hdr) { printf( "{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n", hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity ); } else { printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size); } first = true; } void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { printf( #define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu" (first ? ENTRY_FMT : (",\n" ENTRY_FMT)), #undef ENTRY_FMT block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode), entry->oid.stripe, entry->version ); if (bitmap) { printf(",\"bitmap\":\""); for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++) { printf("%02x", bitmap[i]); } printf("\",\"ext_bitmap\":\""); for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++) { printf("%02x", bitmap[dsk.clean_entry_bitmap_size + i]); } printf("\"}"); } else { printf("}"); } first = false; } int disk_tool_t::resize_data() { int r; // Parse parameters r = resize_parse_params(); if (r != 0) return r; // Check parameters and fill allocator fprintf(stderr, "Reading metadata\n"); data_alloc = new allocator((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size); r = process_meta( [this](blockstore_meta_header_v1_t *hdr) { resize_init(hdr); }, [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { data_alloc->set(block_num, true); } ); if (r != 0) return r; fprintf(stderr, "Reading journal\n"); r = process_journal([this](void *buf) { return process_journal_block(buf, [this](int num, journal_entry *je) { if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT) { data_alloc->set(je->big_write.location / dsk.data_block_size, true); } }); }); if (r != 0) return r; // Remap blocks r = resize_remap_blocks(); if (r != 0) return r; // Copy data blocks into new places fprintf(stderr, "Moving data blocks\n"); r = resize_copy_data(); if (r != 0) return r; // Rewrite journal fprintf(stderr, "Rebuilding journal\n"); r = resize_rewrite_journal(); if (r != 0) return r; // Rewrite metadata fprintf(stderr, "Rebuilding metadata\n"); r = resize_rewrite_meta(); if (r != 0) return r; // Write new journal fprintf(stderr, "Writing new journal\n"); r = resize_write_new_journal(); if (r != 0) return r; // Write new metadata fprintf(stderr, "Writing new metadata\n"); r = resize_write_new_meta(); if (r != 0) return r; fprintf(stderr, "Done\n"); return 0; } int disk_tool_t::resize_parse_params() { try { dsk.parse_config(options); dsk.open_data(); dsk.open_meta(); dsk.open_journal(); dsk.calc_lengths(); dsk.close_all(); } catch (std::exception & e) { dsk.close_all(); fprintf(stderr, "Error: %s\n", e.what()); return 1; } iodepth = strtoull(options["iodepth"].c_str(), NULL, 10); if (!iodepth) iodepth = 32; new_meta_device = options.find("new_meta_device") != options.end() ? options["new_meta_device"] : dsk.meta_device; new_journal_device = options.find("new_journal_device") != options.end() ? options["new_journal_device"] : dsk.journal_device; new_data_offset = options.find("new_data_offset") != options.end() ? strtoull(options["new_data_offset"].c_str(), NULL, 10) : dsk.data_offset; new_data_len = options.find("new_data_len") != options.end() ? strtoull(options["new_data_len"].c_str(), NULL, 10) : dsk.data_len; new_meta_offset = options.find("new_meta_offset") != options.end() ? strtoull(options["new_meta_offset"].c_str(), NULL, 10) : dsk.meta_offset; new_meta_len = options.find("new_meta_len") != options.end() ? strtoull(options["new_meta_len"].c_str(), NULL, 10) : 0; // will be calculated in resize_init() new_journal_offset = options.find("new_journal_offset") != options.end() ? strtoull(options["new_journal_offset"].c_str(), NULL, 10) : dsk.journal_offset; new_journal_len = options.find("new_journal_len") != options.end() ? strtoull(options["new_journal_len"].c_str(), NULL, 10) : dsk.journal_len; if (new_meta_device == dsk.meta_device && new_journal_device == dsk.journal_device && new_data_offset == dsk.data_offset && new_data_len == dsk.data_len && new_meta_offset == dsk.meta_offset && (new_meta_len == dsk.meta_len || new_meta_len == 0) && new_journal_offset == dsk.journal_offset && new_journal_len == dsk.journal_len && options.find("force") == options.end()) { // No difference fprintf(stderr, "No difference, specify --force to rewrite journal and meta anyway\n"); return 1; } return 0; } void disk_tool_t::resize_init(blockstore_meta_header_v1_t *hdr) { if (hdr && dsk.data_block_size != hdr->data_block_size) { if (dsk.data_block_size) { fprintf(stderr, "Using data block size of %u bytes from metadata superblock\n", hdr->data_block_size); } dsk.data_block_size = hdr->data_block_size; } if (((new_data_len-dsk.data_len) % dsk.data_block_size) || ((new_data_offset-dsk.data_offset) % dsk.data_block_size)) { fprintf(stderr, "Data alignment mismatch\n"); exit(1); } data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size; free_first = new_data_offset > dsk.data_offset ? (new_data_offset-dsk.data_offset) / dsk.data_block_size : 0; free_last = (new_data_offset+new_data_len < dsk.data_offset+dsk.data_len) ? (dsk.data_offset+dsk.data_len-new_data_offset-new_data_len) / dsk.data_block_size : 0; new_clean_entry_bitmap_size = dsk.data_block_size / (hdr ? hdr->bitmap_granularity : 4096) / 8; new_clean_entry_size = sizeof(clean_disk_entry) + 2 * new_clean_entry_bitmap_size; new_entries_per_block = dsk.meta_block_size/new_clean_entry_size; uint64_t new_meta_blocks = 1 + (new_data_len/dsk.data_block_size + new_entries_per_block-1) / new_entries_per_block; if (!new_meta_len) { new_meta_len = dsk.meta_block_size*new_meta_blocks; } if (new_meta_len < dsk.meta_block_size*new_meta_blocks) { fprintf(stderr, "New metadata area size is too small, should be at least %lu bytes\n", dsk.meta_block_size*new_meta_blocks); exit(1); } // Check that new metadata, journal and data areas don't overlap if (new_meta_device == dsk.data_device && new_meta_offset < new_data_offset+new_data_len && new_meta_offset+new_meta_len > new_data_offset) { fprintf(stderr, "New metadata area overlaps with data\n"); exit(1); } if (new_journal_device == dsk.data_device && new_journal_offset < new_data_offset+new_data_len && new_journal_offset+new_journal_len > new_data_offset) { fprintf(stderr, "New journal area overlaps with data\n"); exit(1); } if (new_journal_device == new_meta_device && new_journal_offset < new_meta_offset+new_meta_len && new_journal_offset+new_journal_len > new_meta_offset) { fprintf(stderr, "New journal area overlaps with metadata\n"); exit(1); } } int disk_tool_t::resize_remap_blocks() { total_blocks = dsk.data_len / dsk.data_block_size; for (uint64_t i = 0; i < free_first; i++) { if (data_alloc->get(i)) data_remap[i] = 0; else data_alloc->set(i, true); } for (uint64_t i = 0; i < free_last; i++) { if (data_alloc->get(total_blocks-i)) data_remap[total_blocks-i] = 0; else data_alloc->set(total_blocks-i, true); } for (auto & p: data_remap) { uint64_t new_loc = data_alloc->find_free(); if (new_loc == UINT64_MAX) { fprintf(stderr, "Not enough space to move data\n"); return 1; } data_alloc->set(new_loc, true); data_remap[p.first] = new_loc; } return 0; } int disk_tool_t::resize_copy_data() { if (iodepth <= 0 || iodepth > 4096) { iodepth = 32; } ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth); dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR); if (dsk.data_fd < 0) { fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno)); delete ringloop; ringloop = NULL; return 1; } moving_blocks = new resizer_data_moving_t[iodepth]; moving_blocks[0].buf = memalign_or_die(MEM_ALIGNMENT, iodepth*dsk.data_block_size); for (int i = 1; i < iodepth; i++) { moving_blocks[i].buf = (uint8_t*)moving_blocks[0].buf + i*dsk.data_block_size; } remap_active = 1; remap_it = data_remap.begin(); ring_consumer.loop = [this]() { remap_active = 0; for (int i = 0; i < iodepth; i++) { if (moving_blocks[i].state == DM_ST_EMPTY && remap_it != data_remap.end()) { uint64_t old_loc = remap_it->first, new_loc = remap_it->second; moving_blocks[i].state = DM_ST_TO_READ; moving_blocks[i].old_loc = old_loc; moving_blocks[i].new_loc = new_loc; remap_it++; } if (moving_blocks[i].state == DM_ST_TO_READ) { struct io_uring_sqe *sqe = ringloop->get_sqe(); if (sqe) { moving_blocks[i].state = DM_ST_READING; struct ring_data_t *data = ((ring_data_t*)sqe->user_data); data->iov = (struct iovec){ moving_blocks[i].buf, dsk.data_block_size }; my_uring_prep_readv(sqe, dsk.data_fd, &data->iov, 1, dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size); data->callback = [this, i](ring_data_t *data) { if (data->res != dsk.data_block_size) { fprintf( stderr, "Failed to read %u bytes at %lu from %s: %s\n", dsk.data_block_size, dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size, dsk.data_device.c_str(), data->res < 0 ? strerror(-data->res) : "short read" ); exit(1); } moving_blocks[i].state = DM_ST_TO_WRITE; ringloop->wakeup(); }; } } if (moving_blocks[i].state == DM_ST_TO_WRITE) { struct io_uring_sqe *sqe = ringloop->get_sqe(); if (sqe) { moving_blocks[i].state = DM_ST_WRITING; struct ring_data_t *data = ((ring_data_t*)sqe->user_data); data->iov = (struct iovec){ moving_blocks[i].buf, dsk.data_block_size }; my_uring_prep_writev(sqe, dsk.data_fd, &data->iov, 1, dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size); data->callback = [this, i](ring_data_t *data) { if (data->res != dsk.data_block_size) { fprintf( stderr, "Failed to write %u bytes at %lu to %s: %s\n", dsk.data_block_size, dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size, dsk.data_device.c_str(), data->res < 0 ? strerror(-data->res) : "short write" ); exit(1); } moving_blocks[i].state = DM_ST_EMPTY; ringloop->wakeup(); }; } } remap_active += moving_blocks[i].state != DM_ST_EMPTY ? 1 : 0; } ringloop->submit(); }; ringloop->register_consumer(&ring_consumer); while (1) { ringloop->loop(); if (!remap_active) break; ringloop->wait(); } ringloop->unregister_consumer(&ring_consumer); free(moving_blocks[0].buf); delete[] moving_blocks; moving_blocks = NULL; close(dsk.data_fd); dsk.data_fd = -1; delete ringloop; ringloop = NULL; return 0; } int disk_tool_t::resize_rewrite_journal() { // Simply overwriting on the fly may be impossible because old and new areas may overlap // For now, just build new journal data in memory new_journal_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_journal_len); new_journal_ptr = new_journal_buf; new_journal_data = new_journal_ptr + dsk.journal_block_size; new_journal_in_pos = 0; memset(new_journal_buf, 0, new_journal_len); process_journal([this](void *buf) { return process_journal_block(buf, [this](int num, journal_entry *je) { if (je->type == JE_START) { journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos); *((journal_entry_start*)ne) = (journal_entry_start){ .magic = JOURNAL_MAGIC, .type = JE_START, .size = sizeof(journal_entry_start), .journal_start = dsk.journal_block_size, .version = JOURNAL_VERSION, }; ne->crc32 = je_crc32(ne); new_journal_ptr += dsk.journal_block_size; new_journal_data = new_journal_ptr+dsk.journal_block_size; new_journal_in_pos = 0; } else { if (dsk.journal_block_size < new_journal_in_pos+je->size) { new_journal_ptr = new_journal_data; if (new_journal_ptr-new_journal_buf >= new_journal_len) { fprintf(stderr, "Error: live entries don't fit to the new journal\n"); exit(1); } new_journal_data = new_journal_ptr+dsk.journal_block_size; new_journal_in_pos = 0; if (dsk.journal_block_size < je->size) { fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je->size); exit(1); } } journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos); memcpy(ne, je, je->size); ne->crc32_prev = new_crc32_prev; if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT) { // Change the block reference auto remap_it = data_remap.find(ne->big_write.location / dsk.data_block_size); if (remap_it != data_remap.end()) { ne->big_write.location = remap_it->second * dsk.data_block_size; } ne->big_write.location += data_idx_diff * dsk.data_block_size; } else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) { ne->small_write.data_offset = new_journal_data-new_journal_buf; if (ne->small_write.data_offset + ne->small_write.len > new_journal_len) { fprintf(stderr, "Error: live entries don't fit to the new journal\n"); exit(1); } memcpy(new_journal_data, small_write_data, ne->small_write.len); new_journal_data += ne->small_write.len; } ne->crc32 = je_crc32(ne); new_journal_in_pos += ne->size; new_crc32_prev = ne->crc32; } }); }); return 0; } int disk_tool_t::resize_write_new_journal() { new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR); if (new_journal_fd < 0) { fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno)); return 1; } lseek64(new_journal_fd, new_journal_offset, 0); write_blocking(new_journal_fd, new_journal_buf, new_journal_len); fsync(new_journal_fd); close(new_journal_fd); new_journal_fd = -1; free(new_journal_buf); new_journal_buf = NULL; return 0; } int disk_tool_t::resize_rewrite_meta() { new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len); memset(new_meta_buf, 0, new_meta_len); int r = process_meta( [this](blockstore_meta_header_v1_t *hdr) { blockstore_meta_header_v1_t *new_hdr = (blockstore_meta_header_v1_t *)new_meta_buf; new_hdr->zero = 0; new_hdr->magic = BLOCKSTORE_META_MAGIC_V1; new_hdr->version = BLOCKSTORE_META_VERSION_V1; new_hdr->meta_block_size = dsk.meta_block_size; new_hdr->data_block_size = dsk.data_block_size; new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096; }, [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { auto remap_it = data_remap.find(block_num); if (remap_it != data_remap.end()) block_num = remap_it->second; if (block_num < free_first || block_num >= total_blocks-free_last) { fprintf(stderr, "BUG: remapped block not in range\n"); exit(1); } block_num += data_idx_diff; clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size + dsk.meta_block_size*(block_num / new_entries_per_block) + new_clean_entry_size*(block_num % new_entries_per_block)); new_entry->oid = entry->oid; new_entry->version = entry->version; if (bitmap) memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size); else memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size); } ); if (r != 0) { free(new_meta_buf); new_meta_buf = NULL; return r; } return 0; } int disk_tool_t::resize_write_new_meta() { new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR); if (new_meta_fd < 0) { fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno)); return 1; } lseek64(new_meta_fd, new_meta_offset, 0); write_blocking(new_meta_fd, new_meta_buf, new_meta_len); fsync(new_meta_fd); close(new_meta_fd); new_meta_fd = -1; free(new_meta_buf); new_meta_buf = NULL; return 0; } static std::string udev_escape(std::string str) { std::string r; int p = str.find_first_of("\"\' \t\r\n"), prev = 0; if (p == std::string::npos) { return str; } while (p != std::string::npos) { r += str.substr(prev, p-prev); r += "\\"; prev = p; p = str.find_first_of("\"\' \t\r\n", p+1); } r += str.substr(prev); return r; } static std::string realpath_str(std::string path) { char *p = realpath((char*)path.c_str(), NULL); if (!p) { fprintf(stderr, "Failed to resolve %s: %s\n", path.c_str(), strerror(errno)); return path; } std::string rp(p); free(p); return rp; } int disk_tool_t::udev_import(std::string device) { std::string device_type; json11::Json osd_params = read_osd_superblock(device, device_type); if (osd_params.is_null()) { return 1; } uint64_t osd_num = osd_params["osd_num"].uint64_value(); // Print variables for udev printf("VITASTOR_OSD_NUM=%lu\n", osd_num); printf("VITASTOR_ALIAS=osd%lu%s\n", osd_num, device_type.c_str()); printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(osd_params["data_device"].string_value()).c_str()); if (osd_params["meta_device"].string_value() != "" && osd_params["meta_device"] != osd_params["data_device"]) printf("VITASTOR_META_DEVICE=%s\n", udev_escape(osd_params["meta_device"].string_value()).c_str()); if (osd_params["journal_device"].string_value() != "" && osd_params["journal_device"] != osd_params["data_device"]) printf("VITASTOR_JOURNAL_DEVICE=%s\n", udev_escape(osd_params["journal_device"].string_value()).c_str()); return 0; } int disk_tool_t::write_sb(std::string device) { std::string input; int r; char buf[4096]; while (1) { r = read(0, buf, sizeof(buf)); if (r <= 0 && errno != EAGAIN) break; input += std::string(buf, r); } std::string json_err; json11::Json params = json11::Json::parse(input, json_err); if (json_err != "" || !params["osd_num"].uint64_value() || params["data_device"].string_value() == "") { fprintf(stderr, "Invalid JSON input\n"); return 1; } return !write_osd_superblock(device, params); } uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json params) { std::string json_data = params.dump(); uint32_t sb_size = sizeof(vitastor_disk_superblock_t)+json_data.size(); if (sb_size > VITASTOR_DISK_MAX_SB_SIZE) { fprintf(stderr, "JSON data for superblock is too large\n"); return 0; } uint64_t buf_len = ((sb_size+4095)/4096) * 4096; uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, buf_len); memset(buf, 0, buf_len); vitastor_disk_superblock_t *sb = (vitastor_disk_superblock_t*)buf; sb->magic = VITASTOR_DISK_MAGIC; sb->size = sb_size; memcpy(sb->json_data, json_data.c_str(), json_data.size()); sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)); int fd = open(device.c_str(), O_DIRECT|O_RDWR); if (fd < 0) { fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); free(buf); return 0; } int r = write_blocking(fd, buf, buf_len); if (r < 0) { fprintf(stderr, "Failed to write to %s: %s\n", device.c_str(), strerror(errno)); close(fd); free(buf); return 0; } close(fd); free(buf); return sb_size; } json11::Json disk_tool_t::read_osd_superblock(std::string device, std::string & device_type) { vitastor_disk_superblock_t *sb = NULL; uint8_t *buf = NULL; json11::Json osd_params; std::string json_err; std::string real_device; int r, fd = open(device.c_str(), O_DIRECT|O_RDWR); if (fd < 0) { fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); return osd_params; } buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096); r = read_blocking(fd, buf, 4096); if (r != 4096) { fprintf(stderr, "Failed to read OSD superblock from %s: %s\n", device.c_str(), strerror(errno)); goto ex; } sb = (vitastor_disk_superblock_t*)buf; if (sb->magic != VITASTOR_DISK_MAGIC) { fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str()); goto ex; } if (sb->size > VITASTOR_DISK_MAX_SB_SIZE || // +2 is minimal json: {} sb->size < sizeof(vitastor_disk_superblock_t)+2) { fprintf(stderr, "Invalid OSD superblock on %s: invalid size\n", device.c_str()); goto ex; } if (sb->size > 4096) { uint64_t sb_size = ((sb->size+4095)/4096)*4096; free(buf); buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, sb_size); lseek64(fd, 0, 0); r = read_blocking(fd, buf, sb_size); if (r != sb_size) { fprintf(stderr, "Failed to read OSD superblock from %s: %s\n", device.c_str(), strerror(errno)); goto ex; } sb = (vitastor_disk_superblock_t*)buf; } if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf))) { fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str()); goto ex; } osd_params = json11::Json::parse(std::string((char*)sb->json_data, sb->size - sizeof(vitastor_disk_superblock_t)), json_err); if (json_err != "") { fprintf(stderr, "Invalid OSD superblock on %s: invalid JSON\n", device.c_str()); goto ex; } // Validate superblock if (!osd_params["osd_num"].uint64_value()) { fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str()); osd_params = json11::Json::object{}; goto ex; } if (osd_params["data_device"].string_value() == "") { fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str()); osd_params = json11::Json::object{}; goto ex; } real_device = realpath_str(device); if (real_device == realpath_str(osd_params["data_device"].string_value())) { device_type = "data"; } else if (osd_params["meta_device"] != "" && real_device == realpath_str(osd_params["meta_device"].string_value())) { device_type = "meta"; } else if (osd_params["journal_device"] != "" && real_device == realpath_str(osd_params["journal_device"].string_value())) { device_type = "journal"; } else { fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str()); osd_params = json11::Json::object{}; goto ex; } ex: free(buf); close(fd); return osd_params; } int disk_tool_t::start_osd(std::string device) { return 0; } int disk_tool_t::exec_osd(std::string device) { std::string device_type; json11::Json osd_params = read_osd_superblock(device, device_type); if (osd_params.is_null()) { return 1; } std::string osd_binary = "/usr/bin/vitastor-osd"; if (options["osd-binary"] != "") { osd_binary = options["osd-binary"]; } std::vector argstr; for (auto & kv: osd_params.object_items()) { argstr.push_back("--"+kv.first); argstr.push_back(kv.second.is_string() ? kv.second.string_value() : kv.second.dump()); } char *argv[argstr.size()+1]; for (int i = 0; i < argstr.size(); i++) { argv[i] = (char*)argstr[i].c_str(); } argv[argstr.size()] = 0; execve(osd_binary.c_str(), argv, environ); return 0; } int disk_tool_t::stop_osd(std::string device) { return 0; }