From fa31e68c7ff9600eef142d7a8c4e44d9591182e3 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 8 Mar 2024 14:35:54 +0300 Subject: [PATCH] Implement auto-unmount local NFS server mode for vitastor-nfs --- src/disk_tool.h | 2 - src/disk_tool_utils.cpp | 17 --- src/nfs_block.cpp | 22 ++- src/nfs_block.h | 4 +- src/nfs_kv.cpp | 69 +++++++++ src/nfs_kv.h | 10 ++ src/nfs_kv_create.cpp | 10 +- src/nfs_kv_read.cpp | 12 +- src/nfs_kv_readdir.cpp | 2 +- src/nfs_kv_remove.cpp | 4 +- src/nfs_kv_rename.cpp | 4 +- src/nfs_kv_setattr.cpp | 4 +- src/nfs_kv_write.cpp | 41 ++--- src/nfs_proxy.cpp | 329 ++++++++++++++++++++++++++-------------- src/nfs_proxy.h | 19 +-- src/str_util.cpp | 20 ++- src/str_util.h | 3 +- 17 files changed, 383 insertions(+), 189 deletions(-) diff --git a/src/disk_tool.h b/src/disk_tool.h index 6b4f2bc1..9e7a43c4 100644 --- a/src/disk_tool.h +++ b/src/disk_tool.h @@ -133,8 +133,6 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output); uint64_t sscanf_json(const char *fmt, const json11::Json & str); void fromhexstr(const std::string & from, int bytes, uint8_t *to); std::string realpath_str(std::string path, bool nofail = true); -std::string read_all_fd(int fd); -std::string read_file(std::string file, bool allow_enoent = false); int disable_cache(std::string dev); std::string get_parent_device(std::string dev); bool json_is_true(const json11::Json & val); diff --git a/src/disk_tool_utils.cpp b/src/disk_tool_utils.cpp index 5e11da86..8dfcfed0 100644 --- a/src/disk_tool_utils.cpp +++ b/src/disk_tool_utils.cpp @@ -55,23 +55,6 @@ std::string realpath_str(std::string path, bool nofail) return rp; } -std::string read_file(std::string file, bool allow_enoent) -{ - std::string res; - int fd = open(file.c_str(), O_RDONLY); - if (fd < 0 || (res = read_all_fd(fd)) == "") - { - int err = errno; - if (fd >= 0) - close(fd); - if (!allow_enoent || err != ENOENT) - fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(err)); - return ""; - } - 
close(fd); - return res; -} - // returns 1 = check error, 0 = write through, -1 = write back // (similar to 1 = warning, -1 = error, 0 = success in disable_cache) static int check_queue_cache(std::string dev, std::string parent_dev) diff --git a/src/nfs_block.cpp b/src/nfs_block.cpp index 39e56e07..1336fc21 100644 --- a/src/nfs_block.cpp +++ b/src/nfs_block.cpp @@ -34,7 +34,7 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what) std::string name = what.name; return (dir.size() ? dir+"/"+name - : self->parent->name_prefix+name); + : self->parent->blockfs->name_prefix+name); } static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir) @@ -985,7 +985,7 @@ static void block_nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus) if (dir_it != self->parent->blockfs->dir_by_hash.end()) dir = dir_it->second; } - std::string prefix = dir.size() ? dir+"/" : self->parent->name_prefix; + std::string prefix = dir.size() ? dir+"/" : self->parent->blockfs->name_prefix; std::map entries; for (auto & ic: self->parent->cli->st_cli.inode_config) { @@ -1154,8 +1154,20 @@ static int block_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop) return 0; } -void block_fs_state_t::init(nfs_proxy_t *proxy) +void block_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg) { + name_prefix = cfg["subdir"].string_value(); + { + int e = name_prefix.size(); + while (e > 0 && name_prefix[e-1] == '/') + e--; + int s = 0; + while (s < e && name_prefix[s] == '/') + s++; + name_prefix = name_prefix.substr(s, e-s); + if (name_prefix.size()) + name_prefix += "/"; + } // We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long dir_info[""] = (nfs_dir_t){ .id = 1, @@ -1172,7 +1184,7 @@ void block_fs_state_t::init(nfs_proxy_t *proxy) } auto & inode_cfg = inode_cfg_it->second; std::string full_name = inode_cfg.name; - if (proxy->name_prefix != "" && full_name.substr(0, proxy->name_prefix.size()) != proxy->name_prefix) + if 
(proxy->blockfs->name_prefix != "" && full_name.substr(0, proxy->blockfs->name_prefix.size()) != proxy->blockfs->name_prefix) { return; } @@ -1181,7 +1193,7 @@ void block_fs_state_t::init(nfs_proxy_t *proxy) clock_gettime(CLOCK_REALTIME, &now); dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev; dir_info[""].mtime = now; - int pos = full_name.find('/', proxy->name_prefix.size()); + int pos = full_name.find('/', proxy->blockfs->name_prefix.size()); while (pos >= 0) { std::string dir = full_name.substr(0, pos); diff --git a/src/nfs_block.h b/src/nfs_block.h index ac77bc72..7fcf9add 100644 --- a/src/nfs_block.h +++ b/src/nfs_block.h @@ -36,6 +36,8 @@ struct extend_inode_t struct block_fs_state_t { + std::string name_prefix; + // filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root) uint64_t next_dir_id = 2; // filehandle => dir with name_prefix @@ -51,7 +53,7 @@ struct block_fs_state_t std::map extends; std::multimap extend_writes; - void init(nfs_proxy_t *proxy); + void init(nfs_proxy_t *proxy, json11::Json cfg); }; nfsstat3 vitastor_nfs_map_err(int err); diff --git a/src/nfs_kv.cpp b/src/nfs_kv.cpp index 46166d54..60d74600 100644 --- a/src/nfs_kv.cpp +++ b/src/nfs_kv.cpp @@ -190,3 +190,72 @@ void nfs_kv_procs(nfs_client_t *self) self->proc_table.insert(pt[i]); } } + +void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg) +{ + // Check if we're using VitastorFS + fs_kv_inode = cfg["fs"].uint64_value(); + if (fs_kv_inode) + { + if (!INODE_POOL(fs_kv_inode)) + { + fprintf(stderr, "FS metadata inode number must include pool\n"); + exit(1); + } + } + else + { + for (auto & ic: proxy->cli->st_cli.inode_config) + { + if (ic.second.name == cfg["fs"].string_value()) + { + fs_kv_inode = ic.first; + break; + } + } + if (!fs_kv_inode) + { + fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str()); + exit(1); + } + } + 
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value(); + if (!readdir_getattr_parallel) + readdir_getattr_parallel = 8; + id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value(); + if (!id_alloc_batch_size) + id_alloc_batch_size = 200; + auto & pool_cfg = proxy->cli->st_cli.pool_config.at(proxy->default_pool_id); + pool_block_size = pool_cfg.pg_stripe_size; + pool_alignment = pool_cfg.bitmap_granularity; + // Open DB and wait + int open_res = 0; + bool open_done = false; + proxy->db = new kv_dbw_t(proxy->cli); + proxy->db->open(fs_kv_inode, cfg, [&](int res) + { + open_done = true; + open_res = res; + }); + while (!open_done) + { + proxy->ringloop->loop(); + if (open_done) + break; + proxy->ringloop->wait(); + } + if (open_res < 0) + { + fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n", + strerror(-open_res), open_res); + exit(1); + } + fs_base_inode = ((uint64_t)proxy->default_pool_id << (64-POOL_ID_BITS)); + fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1; + shared_inode_threshold = pool_block_size; + if (!cfg["shared_inode_threshold"].is_null()) + { + shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value(); + } + zero_block.resize(pool_block_size); +} diff --git a/src/nfs_kv.h b/src/nfs_kv.h index 0ce3b76b..e52e7ef3 100644 --- a/src/nfs_kv.h +++ b/src/nfs_kv.h @@ -45,6 +45,14 @@ struct kv_inode_extend_t struct kv_fs_state_t { + uint64_t fs_kv_inode = 0; + uint64_t fs_base_inode = 0; + uint64_t fs_inode_count = 0; + int readdir_getattr_parallel = 8, id_alloc_batch_size = 200; + uint64_t pool_block_size = 0; + uint64_t pool_alignment = 0; + uint64_t shared_inode_threshold = 0; + std::map list_cookies; uint64_t fs_next_id = 1, fs_allocated_id = 0; std::vector unallocated_ids; @@ -52,6 +60,8 @@ struct kv_fs_state_t uint64_t cur_shared_inode = 0, cur_shared_offset = 0; std::map extends; std::vector zero_block; + + void init(nfs_proxy_t *proxy, json11::Json cfg); }; struct 
shared_file_header_t diff --git a/src/nfs_kv_create.cpp b/src/nfs_kv_create.cpp index bd4ebec6..9047cf62 100644 --- a/src/nfs_kv_create.cpp +++ b/src/nfs_kv_create.cpp @@ -16,7 +16,7 @@ void allocate_new_id(nfs_client_t *self, std::functionparent->kvfs->fs_next_id++); return; } - else if (self->parent->kvfs->fs_next_id > self->parent->fs_inode_count) + else if (self->parent->kvfs->fs_next_id > self->parent->kvfs->fs_inode_count) { cb(-ENOSPC, 0); return; @@ -29,7 +29,7 @@ void allocate_new_id(nfs_client_t *self, std::function= self->parent->fs_inode_count) + if (prev_val >= self->parent->kvfs->fs_inode_count) { cb(-ENOSPC, 0); return; @@ -38,10 +38,10 @@ void allocate_new_id(nfs_client_t *self, std::functionparent->id_alloc_batch_size; - if (new_val >= self->parent->fs_inode_count) + uint64_t new_val = prev_val + self->parent->kvfs->id_alloc_batch_size; + if (new_val >= self->parent->kvfs->fs_inode_count) { - new_val = self->parent->fs_inode_count; + new_val = self->parent->kvfs->fs_inode_count; } self->parent->db->set(KV_NEXT_ID_KEY, std::to_string(new_val), [=](int res) { diff --git a/src/nfs_kv_read.cpp b/src/nfs_kv_read.cpp index aa36c887..a6be72f3 100644 --- a/src/nfs_kv_read.cpp +++ b/src/nfs_kv_read.cpp @@ -36,7 +36,7 @@ static void nfs_kv_continue_read(nfs_kv_read_state *st, int state) fprintf(stderr, "BUG: invalid state in nfs_kv_continue_read()"); abort(); } - if (st->offset + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold) + if (st->offset + sizeof(shared_file_header_t) < st->self->parent->kvfs->shared_inode_threshold) { kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs) { @@ -59,7 +59,7 @@ resume_1: st->buf = st->aligned_buf + sizeof(shared_file_header_t) + st->offset; st->op = new cluster_op_t; st->op->opcode = OSD_OP_READ; - st->op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value(); + st->op->inode = st->self->parent->kvfs->fs_base_inode + 
st->ientry["shared_ino"].uint64_value(); st->op->offset = st->ientry["shared_offset"].uint64_value(); if (st->offset+st->size > st->ientry["size"].uint64_value()) { @@ -99,14 +99,14 @@ resume_2: return; } } - st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1)); - st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment-1) & - ~(st->self->parent->pool_alignment-1)) - st->aligned_offset; + st->aligned_offset = (st->offset & ~(st->self->parent->kvfs->pool_alignment-1)); + st->aligned_size = ((st->offset + st->size + st->self->parent->kvfs->pool_alignment-1) & + ~(st->self->parent->kvfs->pool_alignment-1)) - st->aligned_offset; st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); st->buf = st->aligned_buf + st->offset - st->aligned_offset; st->op = new cluster_op_t; st->op->opcode = OSD_OP_READ; - st->op->inode = st->self->parent->fs_base_inode + st->ino; + st->op->inode = st->self->parent->kvfs->fs_base_inode + st->ino; st->op->offset = st->aligned_offset; st->op->len = st->aligned_size; st->op->iov.push_back(st->aligned_buf, st->aligned_size); diff --git a/src/nfs_kv_readdir.cpp b/src/nfs_kv_readdir.cpp index 80289c82..28e91428 100644 --- a/src/nfs_kv_readdir.cpp +++ b/src/nfs_kv_readdir.cpp @@ -46,7 +46,7 @@ static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state); static void kv_getattr_next(nfs_kv_readdir_state *st) { - while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel) + while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->kvfs->readdir_getattr_parallel) { auto idx = st->getattr_cur++; st->getattr_running++; diff --git a/src/nfs_kv_remove.cpp b/src/nfs_kv_remove.cpp index c2a0bf14..e99ec223 100644 --- a/src/nfs_kv_remove.cpp +++ b/src/nfs_kv_remove.cpp @@ -231,8 +231,8 @@ resume_6: { // Remove data 
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { - { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, - { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, + { "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, + { "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, }), [st](const cli_result_t & r) { if (r.err) diff --git a/src/nfs_kv_rename.cpp b/src/nfs_kv_rename.cpp index 1f0da052..d6d018a6 100644 --- a/src/nfs_kv_rename.cpp +++ b/src/nfs_kv_rename.cpp @@ -278,8 +278,8 @@ resume_8: if (st->rm_dest_data) { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { - { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, - { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, + { "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, + { "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, }), [st](const cli_result_t & r) { if (r.err) diff --git a/src/nfs_kv_setattr.cpp b/src/nfs_kv_setattr.cpp index edb24f56..5dbf1537 100644 --- a/src/nfs_kv_setattr.cpp +++ b/src/nfs_kv_setattr.cpp @@ -104,8 +104,8 @@ resume_2: { // Delete extra data when downsizing st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { - { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, - { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, + { "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, + { "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, { "min_offset", st->set_attrs["size"].uint64_value() }, }), [st](const cli_result_t & r) { diff --git 
a/src/nfs_kv_write.cpp b/src/nfs_kv_write.cpp index 8a920f8c..1d3485bd 100644 --- a/src/nfs_kv_write.cpp +++ b/src/nfs_kv_write.cpp @@ -8,6 +8,9 @@ #include "nfs_proxy.h" #include "nfs_kv.h" +// FIXME: Implement shared inode defragmentator +// FIXME: Implement fsck for vitastor-fs and for vitastor-kv + struct nfs_rmw_t { nfs_kv_write_state *st = NULL; @@ -67,7 +70,7 @@ static void finish_allocate_shared(nfs_client_t *self, int res) { w.st->shared_inode = self->parent->kvfs->cur_shared_inode; w.st->shared_offset = self->parent->kvfs->cur_shared_offset; - self->parent->kvfs->cur_shared_offset += (w.size + self->parent->pool_alignment-1) & ~(self->parent->pool_alignment-1); + self->parent->kvfs->cur_shared_offset += (w.size + self->parent->kvfs->pool_alignment-1) & ~(self->parent->kvfs->pool_alignment-1); } nfs_kv_continue_write(w.st, w.state); } @@ -113,22 +116,22 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t si st->res = 0; st->shared_inode = st->self->parent->kvfs->cur_shared_inode; st->shared_offset = st->self->parent->kvfs->cur_shared_offset; - st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1); + st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->kvfs->pool_alignment-1) & ~(st->self->parent->kvfs->pool_alignment-1); nfs_kv_continue_write(st, state); } } uint64_t align_shared_size(nfs_client_t *self, uint64_t size) { - return (size + sizeof(shared_file_header_t) + self->parent->pool_alignment-1) - & ~(self->parent->pool_alignment-1); + return (size + sizeof(shared_file_header_t) + self->parent->kvfs->pool_alignment-1) + & ~(self->parent->kvfs->pool_alignment-1); } static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::function prepare, nfs_kv_write_state *st, int state) { auto op = new cluster_op_t; op->opcode = OSD_OP_WRITE; - op->inode = st->self->parent->fs_base_inode + ino; + op->inode = 
st->self->parent->kvfs->fs_base_inode + ino; op->offset = offset; op->len = size; prepare(op); @@ -151,8 +154,8 @@ static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::func static void nfs_do_unshare_write(nfs_kv_write_state *st, int state) { - uint64_t unshare_size = (st->ientry["size"].uint64_value() + st->self->parent->pool_alignment-1) - & ~(st->self->parent->pool_alignment-1); + uint64_t unshare_size = (st->ientry["size"].uint64_value() + st->self->parent->kvfs->pool_alignment-1) + & ~(st->self->parent->kvfs->pool_alignment-1); nfs_do_write(st->ino, 0, unshare_size, [&](cluster_op_t *op) { op->iov.push_back(st->aligned_buf + sizeof(shared_file_header_t), unshare_size); @@ -162,16 +165,16 @@ static void nfs_do_unshare_write(nfs_kv_write_state *st, int state) static void nfs_do_rmw(nfs_rmw_t *rmw) { auto parent = rmw->st->self->parent; - auto align = parent->pool_alignment; + auto align = parent->kvfs->pool_alignment; assert(rmw->size < align); - assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size)); + assert((rmw->offset/parent->kvfs->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->kvfs->pool_block_size)); if (!rmw->part_buf) { rmw->part_buf = (uint8_t*)malloc_or_die(align); } auto op = new cluster_op_t; op->opcode = OSD_OP_READ; - op->inode = parent->fs_base_inode + rmw->ino; + op->inode = parent->kvfs->fs_base_inode + rmw->ino; op->offset = rmw->offset & ~(align-1); op->len = align; op->iov.push_back(rmw->part_buf, op->len); @@ -196,7 +199,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw) auto st = rmw->st; rmw->version = rd_op->version+1; if (st->rmw[0].st && st->rmw[1].st && - st->rmw[0].offset/st->self->parent->pool_block_size == st->rmw[1].offset/st->self->parent->pool_block_size) + st->rmw[0].offset/st->self->parent->kvfs->pool_block_size == st->rmw[1].offset/st->self->parent->kvfs->pool_block_size) { // Same block... RMWs should be sequential int other = rmw == &st->rmw[0] ? 
1 : 0; @@ -204,12 +207,12 @@ static void nfs_do_rmw(nfs_rmw_t *rmw) } } auto parent = rmw->st->self->parent; - auto align = parent->pool_alignment; + auto align = parent->kvfs->pool_alignment; bool is_begin = (rmw->offset % align); bool is_end = ((rmw->offset+rmw->size) % align); auto op = new cluster_op_t; op->opcode = OSD_OP_WRITE; - op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino; + op->inode = rmw->st->self->parent->kvfs->fs_base_inode + rmw->ino; op->offset = rmw->offset & ~(align-1); op->len = align; op->version = rmw->version; @@ -258,7 +261,7 @@ static void nfs_do_shared_read(nfs_kv_write_state *st, int state) { auto op = new cluster_op_t; op->opcode = OSD_OP_READ; - op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value(); + op->inode = st->self->parent->kvfs->fs_base_inode + st->ientry["shared_ino"].uint64_value(); op->offset = st->ientry["shared_offset"].uint64_value(); op->len = align_shared_size(st->self, st->ientry["size"].uint64_value()); op->iov.push_back(st->aligned_buf, op->len); @@ -291,7 +294,7 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int else if (state == base_state) goto resume_0; assert(!st->aligned_buf); st->aligned_size = unshare - ? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1)) + ? 
sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->kvfs->pool_alignment-1) & ~(st->self->parent->kvfs->pool_alignment-1)) : align_shared_size(st->self, st->new_size); st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); // FIXME do not allocate zeroes if we only need zeroes @@ -351,7 +354,7 @@ static void nfs_do_shared_write(nfs_kv_write_state *st, int state, bool only_ali static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t offset, uint64_t shared_alloc, int state) { - auto alignment = st->self->parent->pool_alignment; + auto alignment = st->self->parent->kvfs->pool_alignment; uint64_t end = (offset+st->size); uint8_t *good_buf = st->buf; uint64_t good_offset = offset; @@ -667,18 +670,18 @@ resume_1: cb(st->res == 0 ? -EINVAL : st->res); return; } - st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->fs_base_inode + st->ino); + st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->kvfs->fs_base_inode + st->ino); st->new_size = st->ientry["size"].uint64_value(); if (st->new_size < st->offset + st->size) { st->new_size = st->offset + st->size; } - if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold) + if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->kvfs->shared_inode_threshold) { if (st->ientry["size"].uint64_value() == 0 && st->ientry["shared_ino"].uint64_value() == 0 || st->ientry["empty"].bool_value() && - (st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold || + (st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->kvfs->shared_inode_threshold || st->ientry["shared_ino"].uint64_value() != 0 && st->ientry["shared_alloc"].uint64_value() < sizeof(shared_file_header_t)+st->offset+st->size) { diff --git a/src/nfs_proxy.cpp b/src/nfs_proxy.cpp index c98549eb..4fae4974 100644 --- a/src/nfs_proxy.cpp 
+++ b/src/nfs_proxy.cpp @@ -10,9 +10,10 @@ #include #include +#include #include #include -//#include +#include #include "nfs/nfs.h" #include "nfs/rpc.h" @@ -34,6 +35,10 @@ const char *exe_name = NULL; nfs_proxy_t::~nfs_proxy_t() { + if (kvfs) + delete kvfs; + if (blockfs) + delete blockfs; if (db) delete db; if (cmd) @@ -49,45 +54,79 @@ nfs_proxy_t::~nfs_proxy_t() delete ringloop; } +static const char* help_text = + "Vitastor NFS 3.0 proxy " VERSION "\n" + "(c) Vitaliy Filippov, 2021+ (VNPL-1.1)\n" + "\n" + "vitastor-nfs (--fs | --block) mount \n" + " Start local filesystem server and mount file system to .\n" + " Use regular `umount ` to unmount the FS.\n" + " The server will be automatically stopped when the FS is unmounted.\n" + "\n" + "vitastor-nfs (--fs | --block) start\n" + " Start network NFS server. Options:\n" + " --bind bind service to address (default 0.0.0.0)\n" + " --port use port for NFS services (default is 2049)\n" + " --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n" + "\n" + "OPTIONS:\n" + " --fs use VitastorFS with metadata in image \n" + " --block use pseudo-FS presenting images as files\n" + " --pool use as default pool for new files\n" + " --subdir export instead of root directory\n" + " --nfspath set NFS export path to (default is /)\n" + " --pidfile write process ID to the specified file\n" + " --logfile log to the specified file\n" + " --foreground 1 stay in foreground, do not daemonize\n" + "\n" + "NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n" + "you do not use client_enable_writeback=true, so you can freely use multiple\n" + "NFS proxies with L3 load balancing in this case.\n" + "\n" + "Example start and mount commands for a custom NFS port:\n" + " vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n" + " mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n" + "Or just:\n" + " vitastor-nfs mount --block 
--pool testpool /mnt/\n" +; + json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[]) { json11::Json::object cfg; + std::vector cmd; for (int i = 1; i < narg; i++) { if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help")) { - printf( - "Vitastor NFS 3.0 proxy\n" - "(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n" - "\n" - "USAGE:\n" - " %s [STANDARD OPTIONS] [OTHER OPTIONS]\n" - " --fs mount VitastorFS with metadata in image \n" - " --subdir export images prefixed / (default empty - export all images)\n" - " --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n" - " --bind bind service to address (default 0.0.0.0)\n" - " --nfspath set NFS export path to (default is /)\n" - " --port use port for NFS services (default is 2049)\n" - " --pool use as default pool for new files (images)\n" - " --logfile log to the specified file\n" - " --foreground 1 stay in foreground, do not daemonize\n" - "\n" - "NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n" - "you do not use client_enable_writeback=true, so you can freely use multiple\n" - "NFS proxies with L3 load balancing in this case.\n" - "\n" - "Example start and mount commands for a custom NFS port:\n" - " %s --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n" - " mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n", - exe_name, exe_name - ); + printf("%s", help_text); exit(0); } else if (args[i][0] == '-' && args[i][1] == '-') { const char *opt = args[i]+2; - cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i]; + cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "block") || i == narg-1 ? "1" : args[++i]; } + else + { + cmd.push_back(args[i]); + } + } + if (cfg.find("block") == cfg.end() && cfg.find("fs") == cfg.end()) + { + fprintf(stderr, "Specify one of --block or --fs NAME. 
Use vitastor-nfs --help for details\n"); + exit(1); + } + if (cmd.size() >= 2 && cmd[0] == "mount") + { + cfg["mount"] = cmd[1]; + } + else if (cmd.size() >= 1 && cmd[0] == "start") + { + } + else + { + printf("%s", help_text); + exit(1); } return cfg; } @@ -101,6 +140,7 @@ void nfs_proxy_t::run(json11::Json cfg) // Parse options if (cfg["logfile"].string_value() != "") logfile = cfg["logfile"].string_value(); + pidfile = cfg["pidfile"].string_value(); trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0; bind_address = cfg["bind"].string_value(); if (bind_address == "") @@ -113,18 +153,6 @@ void nfs_proxy_t::run(json11::Json cfg) export_root = cfg["nfspath"].string_value(); if (!export_root.size()) export_root = "/"; - name_prefix = cfg["subdir"].string_value(); - { - int e = name_prefix.size(); - while (e > 0 && name_prefix[e-1] == '/') - e--; - int s = 0; - while (s < e && name_prefix[s] == '/') - s++; - name_prefix = name_prefix.substr(s, e-s); - if (name_prefix.size()) - name_prefix += "/"; - } if (cfg["client_writeback_allowed"].is_null()) { // NFS is always aware of fsync, so we allow write-back cache @@ -133,6 +161,14 @@ void nfs_proxy_t::run(json11::Json cfg) obj["client_writeback_allowed"] = true; cfg = obj; } + mountpoint = cfg["mount"].string_value(); + if (mountpoint != "") + { + bind_address = "127.0.0.1"; + nfs_port = 0; + portmap_enabled = false; + exit_on_umount = true; + } // Create client ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE); epmgr = new epoll_manager_t(ringloop); @@ -142,11 +178,6 @@ void nfs_proxy_t::run(json11::Json cfg) cmd->epmgr = epmgr; cmd->cli = cli; watch_stats(); - if (!fs_kv_inode) - { - blockfs = new block_fs_state_t(); - blockfs->init(this); - } // Load image metadata while (!cli->is_ready()) { @@ -158,70 +189,15 @@ void nfs_proxy_t::run(json11::Json cfg) // Check default pool check_default_pool(); // Check if we're using VitastorFS - fs_kv_inode = cfg["fs"].uint64_value(); - if (fs_kv_inode) + 
if (cfg["fs"].is_null()) { - if (!INODE_POOL(fs_kv_inode)) - { - fprintf(stderr, "FS metadata inode number must include pool\n"); - exit(1); - } + blockfs = new block_fs_state_t(); + blockfs->init(this, cfg); } - else if (cfg["fs"].is_string()) + else { - for (auto & ic: cli->st_cli.inode_config) - { - if (ic.second.name == cfg["fs"].string_value()) - { - fs_kv_inode = ic.first; - break; - } - } - if (!fs_kv_inode) - { - fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str()); - exit(1); - } - } - readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value(); - if (!readdir_getattr_parallel) - readdir_getattr_parallel = 8; - id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value(); - if (!id_alloc_batch_size) - id_alloc_batch_size = 200; - if (fs_kv_inode) - { - // Open DB and wait - int open_res = 0; - bool open_done = false; - db = new kv_dbw_t(cli); - db->open(fs_kv_inode, cfg, [&](int res) - { - open_done = true; - open_res = res; - }); - while (!open_done) - { - ringloop->loop(); - if (open_done) - break; - ringloop->wait(); - } - if (open_res < 0) - { - fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n", - strerror(-open_res), open_res); - exit(1); - } - fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS)); - fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1; - shared_inode_threshold = pool_block_size; - if (!cfg["shared_inode_threshold"].is_null()) - { - shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value(); - } - kvfs = new kv_fs_state_t; - kvfs->zero_block.resize(pool_block_size); + kvfs = new kv_fs_state_t(); + kvfs->init(this, cfg); } // Self-register portmap and NFS pmap.reg_ports.insert((portmap_id_t){ @@ -253,7 +229,7 @@ void nfs_proxy_t::run(json11::Json cfg) .addr = "0.0.0.0.0."+std::to_string(nfs_port), }); // Create NFS socket and add it to epoll - int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, NULL); 
+ int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port); fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK); epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events) { @@ -285,24 +261,43 @@ void nfs_proxy_t::run(json11::Json cfg) } }); } + if (mountpoint != "") + { + mount_fs(); + } if (cfg["foreground"].is_null()) { daemonize(); } - while (true) + if (pidfile != "") + { + write_pid(); + } + while (!finished) { ringloop->loop(); ringloop->wait(); } // Destroy the client cli->flush(); - delete kvfs; - delete db; + if (kvfs) + { + delete kvfs; + kvfs = NULL; + } + if (blockfs) + { + delete blockfs; + blockfs = NULL; + } + if (db) + { + delete db; + db = NULL; + } delete cli; delete epmgr; delete ringloop; - kvfs = NULL; - db = NULL; cli = NULL; epmgr = NULL; ringloop = NULL; @@ -410,8 +405,6 @@ void nfs_proxy_t::check_default_pool() auto pool_it = cli->st_cli.pool_config.begin(); default_pool_id = pool_it->first; default_pool = pool_it->second.name; - pool_block_size = pool_it->second.pg_stripe_size; - pool_alignment = pool_it->second.bitmap_granularity; } else { @@ -426,8 +419,6 @@ void nfs_proxy_t::check_default_pool() if (p.second.name == default_pool) { default_pool_id = p.first; - pool_block_size = p.second.pg_stripe_size; - pool_alignment = p.second.bitmap_granularity; break; } } @@ -447,11 +438,12 @@ void nfs_proxy_t::do_accept(int listen_fd) while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0) { fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str()); + active_connections++; fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK); int one = 1; setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); auto cli = new nfs_client_t(); - if (fs_kv_inode) + if (kvfs) nfs_kv_procs(cli); else nfs_block_procs(cli); @@ -468,6 +460,8 @@ void nfs_proxy_t::do_accept(int listen_fd) { fprintf(stderr, "Client %d 
disconnected\n", nfs_fd); cli->stop(); + cli->parent->active_connections--; + cli->parent->check_exit(); return; } cli->epoll_events |= epoll_events; @@ -1006,6 +1000,109 @@ void nfs_proxy_t::daemonize() open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666); } +void nfs_proxy_t::write_pid() +{ + int fd = open(pidfile.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666); + if (fd < 0) + { + fprintf(stderr, "Failed to create pid file %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno); + return; + } + auto pid = std::to_string(getpid()); + if (write(fd, pid.c_str(), pid.size()) < 0) + { + fprintf(stderr, "Failed to write pid to %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno); + } + close(fd); +} + +static pid_t wanted_pid = 0; +static bool child_finished = false; +static int child_status = -1; + +void single_child_handler(int signal) +{ + child_finished = true; + waitpid(wanted_pid, &child_status, WNOHANG); +} + +void nfs_proxy_t::mount_fs() +{ + signal(SIGCHLD, single_child_handler); + auto pid = fork(); + if (pid < 0) + { + fprintf(stderr, "Failed to fork: %s (code %d)\n", strerror(errno), errno); + exit(1); + } + if (pid > 0) + { + // Parent - loop and wait until child finishes + wanted_pid = pid; + while (!child_finished) + { + ringloop->loop(); + ringloop->wait(); + } + if (!WIFEXITED(child_status) || WEXITSTATUS(child_status) != 0) + { + // Mounting failed + exit(1); + } + } + else + { + // Child + std::string src = ("localhost:"+export_root); + std::string opts = ("port="+std::to_string(listening_port)+",mountport="+std::to_string(listening_port)+",nfsvers=3,soft,nolock,tcp"); + const char *args[] = { "mount", src.c_str(), mountpoint.c_str(), "-o", opts.c_str(), NULL }; + execvp("mount", (char* const*)args); + fprintf(stderr, "Failed to run mount %s %s -o %s: %s (code %d)\n", + src.c_str(), mountpoint.c_str(), opts.c_str(), strerror(errno), errno); + exit(1); + } +} + +void nfs_proxy_t::check_exit() +{ + if (active_connections || !exit_on_umount) 
+ { + return; + } + std::string mountstr = read_file("/proc/mounts"); + if (mountstr == "") + { + return; + } + auto port_opt = "port="+std::to_string(listening_port); + auto mountport_opt = "mountport="+std::to_string(listening_port); + auto mounts = explode("\n", mountstr, true); + for (auto & str: mounts) + { + auto opts = explode(" ", str, true); + if (opts.size() >= 4 && opts[2].size() >= 3 && opts[2].substr(0, 3) == "nfs") + { + opts = explode(",", opts[3], true); + bool port_found = false; + bool addr_found = false; + for (auto & opt: opts) + { + if (opt == port_opt || opt == mountport_opt) + port_found = true; + if (opt == "addr=127.0.0.1" || opt == "mountaddr=127.0.0.1") + addr_found = true; + } + if (port_found && addr_found) + { + // OK, do not unmount + return; + } + } + } + // Not found, unmount + finished = true; +} + int main(int narg, const char *args[]) { setvbuf(stdout, NULL, _IONBF, 0); diff --git a/src/nfs_proxy.h b/src/nfs_proxy.h index 872ddb55..e7dd1f78 100644 --- a/src/nfs_proxy.h +++ b/src/nfs_proxy.h @@ -21,24 +21,22 @@ class nfs_proxy_t { public: std::string bind_address; - std::string name_prefix; uint64_t fsid = 1; uint64_t server_id = 0; std::string default_pool; std::string export_root; bool portmap_enabled; unsigned nfs_port; - uint64_t fs_kv_inode = 0; - uint64_t fs_base_inode = 0; - uint64_t fs_inode_count = 0; - int readdir_getattr_parallel = 8, id_alloc_batch_size = 200; int trace = 0; std::string logfile = "/dev/null"; + std::string pidfile; + bool exit_on_umount = false; + std::string mountpoint; - pool_id_t default_pool_id; - uint64_t pool_block_size = 0; - uint64_t pool_alignment = 0; - uint64_t shared_inode_threshold = 0; + int active_connections = 0; + bool finished = false; + int listening_port = 0; + pool_id_t default_pool_id = 0; portmap_service_t pmap; ring_loop_t *ringloop = NULL; @@ -65,6 +63,9 @@ public: void check_default_pool(); void do_accept(int listen_fd); void daemonize(); + void write_pid(); + void mount_fs(); + 
void check_exit(); }; struct rpc_cur_buffer_t diff --git a/src/str_util.cpp b/src/str_util.cpp index ee3bbb5c..d96b9303 100644 --- a/src/str_util.cpp +++ b/src/str_util.cpp @@ -1,9 +1,10 @@ // Copyright (c) Vitaliy Filippov, 2019+ -// License: VNPL-1.1 (see README.md for details) +// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details) #include #include #include +#include #include "str_util.h" std::string base64_encode(const std::string &in) @@ -304,6 +305,23 @@ std::string read_all_fd(int fd) return res; } +std::string read_file(std::string file, bool allow_enoent) +{ + std::string res; + int fd = open(file.c_str(), O_RDONLY); + if (fd < 0 || (res = read_all_fd(fd)) == "") + { + int err = errno; + if (fd >= 0) + close(fd); + if (!allow_enoent || err != ENOENT) + fprintf(stderr, "Failed to read %s: %s (code %d)\n", file.c_str(), strerror(err), err); + return ""; + } + close(fd); + return res; +} + std::string str_repeat(const std::string & str, int times) { std::string r; diff --git a/src/str_util.h b/src/str_util.h index 23c0d8dd..9c1bd8ed 100644 --- a/src/str_util.h +++ b/src/str_util.h @@ -1,5 +1,5 @@ // Copyright (c) Vitaliy Filippov, 2019+ -// License: VNPL-1.1 (see README.md for details) +// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details) #pragma once #include @@ -18,6 +18,7 @@ std::string format_size(uint64_t size, bool nobytes = false); void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all); uint64_t parse_time(std::string time_str, bool *ok = NULL); std::string read_all_fd(int fd); +std::string read_file(std::string file, bool allow_enoent = false); std::string str_repeat(const std::string & str, int times); size_t utf8_length(const std::string & s); size_t utf8_length(const char *s);