Implement auto-unmount local NFS server mode for vitastor-nfs

master
Vitaliy Filippov 2024-03-08 14:35:54 +03:00
parent 57605a5c13
commit f600ce98e2
18 changed files with 396 additions and 193 deletions

View File

@ -133,8 +133,6 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
uint64_t sscanf_json(const char *fmt, const json11::Json & str);
void fromhexstr(const std::string & from, int bytes, uint8_t *to);
std::string realpath_str(std::string path, bool nofail = true);
std::string read_all_fd(int fd);
std::string read_file(std::string file, bool allow_enoent = false);
int disable_cache(std::string dev);
std::string get_parent_device(std::string dev);
bool json_is_true(const json11::Json & val);

View File

@ -55,23 +55,6 @@ std::string realpath_str(std::string path, bool nofail)
return rp;
}
std::string read_file(std::string file, bool allow_enoent)
{
std::string res;
int fd = open(file.c_str(), O_RDONLY);
if (fd < 0 || (res = read_all_fd(fd)) == "")
{
int err = errno;
if (fd >= 0)
close(fd);
if (!allow_enoent || err != ENOENT)
fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(err));
return "";
}
close(fd);
return res;
}
// returns 1 = check error, 0 = write through, -1 = write back
// (similar to 1 = warning, -1 = error, 0 = success in disable_cache)
static int check_queue_cache(std::string dev, std::string parent_dev)

View File

@ -34,7 +34,7 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what)
std::string name = what.name;
return (dir.size()
? dir+"/"+name
: self->parent->name_prefix+name);
: self->parent->blockfs->name_prefix+name);
}
static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir)
@ -985,7 +985,7 @@ static void block_nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
if (dir_it != self->parent->blockfs->dir_by_hash.end())
dir = dir_it->second;
}
std::string prefix = dir.size() ? dir+"/" : self->parent->name_prefix;
std::string prefix = dir.size() ? dir+"/" : self->parent->blockfs->name_prefix;
std::map<std::string, struct entryplus3> entries;
for (auto & ic: self->parent->cli->st_cli.inode_config)
{
@ -1154,8 +1154,20 @@ static int block_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
return 0;
}
void block_fs_state_t::init(nfs_proxy_t *proxy)
void block_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
{
name_prefix = cfg["subdir"].string_value();
{
int e = name_prefix.size();
while (e > 0 && name_prefix[e-1] == '/')
e--;
int s = 0;
while (s < e && name_prefix[s] == '/')
s++;
name_prefix = name_prefix.substr(s, e-s);
if (name_prefix.size())
name_prefix += "/";
}
// We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
dir_info[""] = (nfs_dir_t){
.id = 1,
@ -1172,7 +1184,7 @@ void block_fs_state_t::init(nfs_proxy_t *proxy)
}
auto & inode_cfg = inode_cfg_it->second;
std::string full_name = inode_cfg.name;
if (proxy->name_prefix != "" && full_name.substr(0, proxy->name_prefix.size()) != proxy->name_prefix)
if (proxy->blockfs->name_prefix != "" && full_name.substr(0, proxy->blockfs->name_prefix.size()) != proxy->blockfs->name_prefix)
{
return;
}
@ -1181,7 +1193,7 @@ void block_fs_state_t::init(nfs_proxy_t *proxy)
clock_gettime(CLOCK_REALTIME, &now);
dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
dir_info[""].mtime = now;
int pos = full_name.find('/', proxy->name_prefix.size());
int pos = full_name.find('/', proxy->blockfs->name_prefix.size());
while (pos >= 0)
{
std::string dir = full_name.substr(0, pos);

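Note: the subdir normalization moved into block_fs_state_t::init() above can be read as a standalone helper (hypothetical name, same logic): it strips all leading and trailing slashes and re-appends exactly one trailing slash when the result is non-empty.

#include <string>

// Standalone equivalent of the normalization above (illustration only):
// "/a/b//" -> "a/b/", "///" -> "", "" -> ""
static std::string normalize_subdir(std::string name_prefix)
{
    size_t e = name_prefix.size();
    while (e > 0 && name_prefix[e-1] == '/')
        e--;
    size_t s = 0;
    while (s < e && name_prefix[s] == '/')
        s++;
    name_prefix = name_prefix.substr(s, e-s);
    if (name_prefix.size())
        name_prefix += "/";
    return name_prefix;
}
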
View File

@ -36,6 +36,8 @@ struct extend_inode_t
struct block_fs_state_t
{
std::string name_prefix;
// filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
uint64_t next_dir_id = 2;
// filehandle => dir with name_prefix
@ -51,7 +53,7 @@ struct block_fs_state_t
std::map<inode_t, extend_inode_t> extends;
std::multimap<extend_size_t, extend_write_t> extend_writes;
void init(nfs_proxy_t *proxy);
void init(nfs_proxy_t *proxy, json11::Json cfg);
};
nfsstat3 vitastor_nfs_map_err(int err);
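
vitastor_nfs_map_err() converts errno-style codes from the cluster client into NFSv3 status codes. A minimal sketch of such a mapping, using the standard RFC 1813 status values for illustration rather than the project's actual table:

#include <errno.h>

// Status values as defined by RFC 1813; the real project takes them
// from nfs/nfs.h, this enum is only for the sketch
enum nfsstat3_example { NFS3_OK = 0, NFS3ERR_NOENT = 2, NFS3ERR_IO = 5,
    NFS3ERR_ACCES = 13, NFS3ERR_EXIST = 17, NFS3ERR_INVAL = 22,
    NFS3ERR_NOSPC = 28, NFS3ERR_NOTEMPTY = 66 };

// Illustrative errno -> NFSv3 status mapping (not the actual function body)
static nfsstat3_example map_err_example(int err)
{
    switch (err < 0 ? -err : err)
    {
        case 0:         return NFS3_OK;
        case ENOENT:    return NFS3ERR_NOENT;
        case EACCES:    return NFS3ERR_ACCES;
        case EEXIST:    return NFS3ERR_EXIST;
        case EINVAL:    return NFS3ERR_INVAL;
        case ENOSPC:    return NFS3ERR_NOSPC;
        case ENOTEMPTY: return NFS3ERR_NOTEMPTY;
        default:        return NFS3ERR_IO; // generic fallback
    }
}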

View File

@ -190,3 +190,72 @@ void nfs_kv_procs(nfs_client_t *self)
self->proc_table.insert(pt[i]);
}
}
void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
{
// Check if we're using VitastorFS
fs_kv_inode = cfg["fs"].uint64_value();
if (fs_kv_inode)
{
if (!INODE_POOL(fs_kv_inode))
{
fprintf(stderr, "FS metadata inode number must include pool\n");
exit(1);
}
}
else
{
for (auto & ic: proxy->cli->st_cli.inode_config)
{
if (ic.second.name == cfg["fs"].string_value())
{
fs_kv_inode = ic.first;
break;
}
}
if (!fs_kv_inode)
{
fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str());
exit(1);
}
}
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
if (!readdir_getattr_parallel)
readdir_getattr_parallel = 8;
id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value();
if (!id_alloc_batch_size)
id_alloc_batch_size = 200;
auto & pool_cfg = proxy->cli->st_cli.pool_config.at(proxy->default_pool_id);
pool_block_size = pool_cfg.pg_stripe_size;
pool_alignment = pool_cfg.bitmap_granularity;
// Open DB and wait
int open_res = 0;
bool open_done = false;
proxy->db = new kv_dbw_t(proxy->cli);
proxy->db->open(fs_kv_inode, cfg, [&](int res)
{
open_done = true;
open_res = res;
});
while (!open_done)
{
proxy->ringloop->loop();
if (open_done)
break;
proxy->ringloop->wait();
}
if (open_res < 0)
{
fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n",
strerror(-open_res), open_res);
exit(1);
}
fs_base_inode = ((uint64_t)proxy->default_pool_id << (64-POOL_ID_BITS));
fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
shared_inode_threshold = pool_block_size;
if (!cfg["shared_inode_threshold"].is_null())
{
shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value();
}
zero_block.resize(pool_block_size);
}
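
The last lines above carve up the 64-bit inode space: the top POOL_ID_BITS bits select the pool, the rest number files within the FS. A worked example, assuming POOL_ID_BITS = 16:

#include <stdint.h>
#include <stdio.h>

int main()
{
    const int POOL_ID_BITS = 16; // assumed value of the Vitastor constant
    uint64_t default_pool_id = 2;
    uint64_t fs_base_inode = default_pool_id << (64-POOL_ID_BITS);
    uint64_t fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
    // pool 2 -> base 0x0002000000000000; 2^48-1 file inode numbers remain,
    // so every FS inode "ino" maps to cluster inode fs_base_inode + ino
    printf("base=0x%016jx, count=%ju\n", (uintmax_t)fs_base_inode, (uintmax_t)fs_inode_count);
    return 0;
}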

View File

@ -45,6 +45,14 @@ struct kv_inode_extend_t
struct kv_fs_state_t
{
uint64_t fs_kv_inode = 0;
uint64_t fs_base_inode = 0;
uint64_t fs_inode_count = 0;
int readdir_getattr_parallel = 8, id_alloc_batch_size = 200;
uint64_t pool_block_size = 0;
uint64_t pool_alignment = 0;
uint64_t shared_inode_threshold = 0;
std::map<list_cookie_t, list_cookie_val_t> list_cookies;
uint64_t fs_next_id = 1, fs_allocated_id = 0;
std::vector<uint64_t> unallocated_ids;
@ -52,6 +60,8 @@ struct kv_fs_state_t
uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
std::map<inode_t, kv_inode_extend_t> extends;
std::vector<uint8_t> zero_block;
void init(nfs_proxy_t *proxy, json11::Json cfg);
};
struct shared_file_header_t

View File

@ -16,7 +16,7 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
cb(0, self->parent->kvfs->fs_next_id++);
return;
}
else if (self->parent->kvfs->fs_next_id > self->parent->fs_inode_count)
else if (self->parent->kvfs->fs_next_id > self->parent->kvfs->fs_inode_count)
{
cb(-ENOSPC, 0);
return;
@ -29,7 +29,7 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
return;
}
uint64_t prev_val = stoull_full(prev_str);
if (prev_val >= self->parent->fs_inode_count)
if (prev_val >= self->parent->kvfs->fs_inode_count)
{
cb(-ENOSPC, 0);
return;
@ -38,10 +38,10 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
{
prev_val = 1;
}
uint64_t new_val = prev_val + self->parent->id_alloc_batch_size;
if (new_val >= self->parent->fs_inode_count)
uint64_t new_val = prev_val + self->parent->kvfs->id_alloc_batch_size;
if (new_val >= self->parent->kvfs->fs_inode_count)
{
new_val = self->parent->fs_inode_count;
new_val = self->parent->kvfs->fs_inode_count;
}
self->parent->db->set(KV_NEXT_ID_KEY, std::to_string(new_val), [=](int res)
{

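allocate_new_id() hands out inode numbers from an in-memory batch and only touches the shared KV_NEXT_ID_KEY counter when the batch (id_alloc_batch_size, default 200) runs out. A simplified, single-threaded sketch of that state:

#include <stdint.h>

// Sketch of the batch allocator state that allocate_new_id() maintains
// (fs_next_id / fs_allocated_id in kv_fs_state_t), simplified
struct id_batch_sketch
{
    uint64_t next_id = 1;      // next id to hand out locally
    uint64_t allocated_id = 0; // last id covered by the reserved batch
    uint64_t batch_size = 200; // id_alloc_batch_size

    // 0 = id produced from the local batch; -1 = batch exhausted: the caller
    // must first advance KV_NEXT_ID_KEY from prev to prev+batch_size in the
    // KV store (retrying on conflict) and update allocated_id accordingly
    int try_allocate(uint64_t *out)
    {
        if (next_id <= allocated_id)
        {
            *out = next_id++;
            return 0;
        }
        return -1;
    }
};
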
View File

@ -36,7 +36,7 @@ static void nfs_kv_continue_read(nfs_kv_read_state *st, int state)
fprintf(stderr, "BUG: invalid state in nfs_kv_continue_read()");
abort();
}
if (st->offset + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
if (st->offset + sizeof(shared_file_header_t) < st->self->parent->kvfs->shared_inode_threshold)
{
kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
{
@ -59,7 +59,7 @@ resume_1:
st->buf = st->aligned_buf + sizeof(shared_file_header_t) + st->offset;
st->op = new cluster_op_t;
st->op->opcode = OSD_OP_READ;
st->op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value();
st->op->inode = st->self->parent->kvfs->fs_base_inode + st->ientry["shared_ino"].uint64_value();
st->op->offset = st->ientry["shared_offset"].uint64_value();
if (st->offset+st->size > st->ientry["size"].uint64_value())
{
@ -99,14 +99,14 @@ resume_2:
return;
}
}
st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1));
st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment-1) &
~(st->self->parent->pool_alignment-1)) - st->aligned_offset;
st->aligned_offset = (st->offset & ~(st->self->parent->kvfs->pool_alignment-1));
st->aligned_size = ((st->offset + st->size + st->self->parent->kvfs->pool_alignment-1) &
~(st->self->parent->kvfs->pool_alignment-1)) - st->aligned_offset;
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
st->buf = st->aligned_buf + st->offset - st->aligned_offset;
st->op = new cluster_op_t;
st->op->opcode = OSD_OP_READ;
st->op->inode = st->self->parent->fs_base_inode + st->ino;
st->op->inode = st->self->parent->kvfs->fs_base_inode + st->ino;
st->op->offset = st->aligned_offset;
st->op->len = st->aligned_size;
st->op->iov.push_back(st->aligned_buf, st->aligned_size);

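The unshared read path above rounds the requested byte range outward to pool_alignment (the pool's bitmap_granularity) before issuing OSD_OP_READ. A worked example of the rounding, assuming a 4096-byte alignment:

#include <assert.h>
#include <stdint.h>

int main()
{
    // Assume pool_alignment = 4096 and a read of 3000 bytes at offset 5000
    uint64_t align = 4096, offset = 5000, size = 3000;
    uint64_t aligned_offset = offset & ~(align-1);
    uint64_t aligned_size = ((offset + size + align-1) & ~(align-1)) - aligned_offset;
    assert(aligned_offset == 4096); // rounded down to the alignment boundary
    assert(aligned_size == 4096);   // [4096,8192) fully covers [5000,8000)
    // caller data starts at aligned_buf + (offset - aligned_offset) == +904
    return 0;
}
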
View File

@ -46,7 +46,7 @@ static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state);
static void kv_getattr_next(nfs_kv_readdir_state *st)
{
while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel)
while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->kvfs->readdir_getattr_parallel)
{
auto idx = st->getattr_cur++;
st->getattr_running++;

View File

@ -231,8 +231,8 @@ resume_6:
{
// Remove data
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) },
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) },
}), [st](const cli_result_t & r)
{
if (r.err)

View File

@ -278,8 +278,8 @@ resume_8:
if (st->rm_dest_data)
{
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->new_direntry["ino"].uint64_value()) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->new_direntry["ino"].uint64_value()) },
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) },
}), [st](const cli_result_t & r)
{
if (r.err)

View File

@ -104,8 +104,8 @@ resume_2:
{
// Delete extra data when downsizing
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) },
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) },
{ "min_offset", st->set_attrs["size"].uint64_value() },
}), [st](const cli_result_t & r)
{

View File

@ -8,6 +8,9 @@
#include "nfs_proxy.h"
#include "nfs_kv.h"
// FIXME: Implement shared inode defragmentator
// FIXME: Implement fsck for vitastor-fs and for vitastor-kv
struct nfs_rmw_t
{
nfs_kv_write_state *st = NULL;
@ -67,7 +70,7 @@ static void finish_allocate_shared(nfs_client_t *self, int res)
{
w.st->shared_inode = self->parent->kvfs->cur_shared_inode;
w.st->shared_offset = self->parent->kvfs->cur_shared_offset;
self->parent->kvfs->cur_shared_offset += (w.size + self->parent->pool_alignment-1) & ~(self->parent->pool_alignment-1);
self->parent->kvfs->cur_shared_offset += (w.size + self->parent->kvfs->pool_alignment-1) & ~(self->parent->kvfs->pool_alignment-1);
}
nfs_kv_continue_write(w.st, w.state);
}
@ -113,22 +116,22 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t si
st->res = 0;
st->shared_inode = st->self->parent->kvfs->cur_shared_inode;
st->shared_offset = st->self->parent->kvfs->cur_shared_offset;
st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1);
st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->kvfs->pool_alignment-1) & ~(st->self->parent->kvfs->pool_alignment-1);
nfs_kv_continue_write(st, state);
}
}
uint64_t align_shared_size(nfs_client_t *self, uint64_t size)
{
return (size + sizeof(shared_file_header_t) + self->parent->pool_alignment-1)
& ~(self->parent->pool_alignment-1);
return (size + sizeof(shared_file_header_t) + self->parent->kvfs->pool_alignment-1)
& ~(self->parent->kvfs->pool_alignment-1);
}
static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::function<void(cluster_op_t *op)> prepare, nfs_kv_write_state *st, int state)
{
auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE;
op->inode = st->self->parent->fs_base_inode + ino;
op->inode = st->self->parent->kvfs->fs_base_inode + ino;
op->offset = offset;
op->len = size;
prepare(op);
@ -151,8 +154,8 @@ static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::func
static void nfs_do_unshare_write(nfs_kv_write_state *st, int state)
{
uint64_t unshare_size = (st->ientry["size"].uint64_value() + st->self->parent->pool_alignment-1)
& ~(st->self->parent->pool_alignment-1);
uint64_t unshare_size = (st->ientry["size"].uint64_value() + st->self->parent->kvfs->pool_alignment-1)
& ~(st->self->parent->kvfs->pool_alignment-1);
nfs_do_write(st->ino, 0, unshare_size, [&](cluster_op_t *op)
{
op->iov.push_back(st->aligned_buf + sizeof(shared_file_header_t), unshare_size);
@ -162,16 +165,16 @@ static void nfs_do_unshare_write(nfs_kv_write_state *st, int state)
static void nfs_do_rmw(nfs_rmw_t *rmw)
{
auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment;
auto align = parent->kvfs->pool_alignment;
assert(rmw->size < align);
assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size));
assert((rmw->offset/parent->kvfs->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->kvfs->pool_block_size));
if (!rmw->part_buf)
{
rmw->part_buf = (uint8_t*)malloc_or_die(align);
}
auto op = new cluster_op_t;
op->opcode = OSD_OP_READ;
op->inode = parent->fs_base_inode + rmw->ino;
op->inode = parent->kvfs->fs_base_inode + rmw->ino;
op->offset = rmw->offset & ~(align-1);
op->len = align;
op->iov.push_back(rmw->part_buf, op->len);
@ -196,7 +199,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
auto st = rmw->st;
rmw->version = rd_op->version+1;
if (st->rmw[0].st && st->rmw[1].st &&
st->rmw[0].offset/st->self->parent->pool_block_size == st->rmw[1].offset/st->self->parent->pool_block_size)
st->rmw[0].offset/st->self->parent->kvfs->pool_block_size == st->rmw[1].offset/st->self->parent->kvfs->pool_block_size)
{
// Same block... RMWs should be sequential
int other = rmw == &st->rmw[0] ? 1 : 0;
@ -204,12 +207,12 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
}
}
auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment;
auto align = parent->kvfs->pool_alignment;
bool is_begin = (rmw->offset % align);
bool is_end = ((rmw->offset+rmw->size) % align);
auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE;
op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino;
op->inode = rmw->st->self->parent->kvfs->fs_base_inode + rmw->ino;
op->offset = rmw->offset & ~(align-1);
op->len = align;
op->version = rmw->version;
@ -258,7 +261,7 @@ static void nfs_do_shared_read(nfs_kv_write_state *st, int state)
{
auto op = new cluster_op_t;
op->opcode = OSD_OP_READ;
op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value();
op->inode = st->self->parent->kvfs->fs_base_inode + st->ientry["shared_ino"].uint64_value();
op->offset = st->ientry["shared_offset"].uint64_value();
op->len = align_shared_size(st->self, st->ientry["size"].uint64_value());
op->iov.push_back(st->aligned_buf, op->len);
@ -291,7 +294,7 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int
else if (state == base_state) goto resume_0;
assert(!st->aligned_buf);
st->aligned_size = unshare
? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1))
? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->kvfs->pool_alignment-1) & ~(st->self->parent->kvfs->pool_alignment-1))
: align_shared_size(st->self, st->new_size);
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
// FIXME do not allocate zeroes if we only need zeroes
@ -351,7 +354,7 @@ static void nfs_do_shared_write(nfs_kv_write_state *st, int state, bool only_ali
static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t offset, uint64_t shared_alloc, int state)
{
auto alignment = st->self->parent->pool_alignment;
auto alignment = st->self->parent->kvfs->pool_alignment;
uint64_t end = (offset+st->size);
uint8_t *good_buf = st->buf;
uint64_t good_offset = offset;
@ -667,18 +670,18 @@ resume_1:
cb(st->res == 0 ? -EINVAL : st->res);
return;
}
st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->fs_base_inode + st->ino);
st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->kvfs->fs_base_inode + st->ino);
st->new_size = st->ientry["size"].uint64_value();
if (st->new_size < st->offset + st->size)
{
st->new_size = st->offset + st->size;
}
if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->kvfs->shared_inode_threshold)
{
if (st->ientry["size"].uint64_value() == 0 &&
st->ientry["shared_ino"].uint64_value() == 0 ||
st->ientry["empty"].bool_value() &&
(st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold ||
(st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->kvfs->shared_inode_threshold ||
st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["shared_alloc"].uint64_value() < sizeof(shared_file_header_t)+st->offset+st->size)
{

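nfs_do_align_write() and nfs_do_rmw() above split an unaligned write into a directly writable aligned middle plus up to two read-modify-write fragments at the edges. A worked example of which fragments need RMW, again assuming 4096-byte alignment:

#include <stdint.h>
#include <stdio.h>

int main()
{
    // Which parts of an unaligned write need read-modify-write?
    // Assume pool_alignment = 4096 and a write covering [1000, 9000).
    uint64_t align = 4096, offset = 1000, size = 8000, end = offset + size;
    bool is_begin = (offset % align) != 0; // head: bytes [1000,4096) inside block [0,4096)
    bool is_end = (end % align) != 0;      // tail: bytes [8192,9000) inside block [8192,12288)
    // The aligned middle [4096,8192) is written directly; head and tail
    // blocks are read back, patched in memory and rewritten (nfs_do_rmw)
    printf("head RMW: %d, tail RMW: %d\n", (int)is_begin, (int)is_end);
    return 0;
}
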
View File

@ -10,9 +10,10 @@
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/wait.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include <signal.h>
#include "nfs/nfs.h"
#include "nfs/rpc.h"
@ -34,6 +35,10 @@ const char *exe_name = NULL;
nfs_proxy_t::~nfs_proxy_t()
{
if (kvfs)
delete kvfs;
if (blockfs)
delete blockfs;
if (db)
delete db;
if (cmd)
@ -49,45 +54,79 @@ nfs_proxy_t::~nfs_proxy_t()
delete ringloop;
}
static const char* help_text =
"Vitastor NFS 3.0 proxy " VERSION "\n"
"(c) Vitaliy Filippov, 2021+ (VNPL-1.1)\n"
"\n"
"vitastor-nfs (--fs <NAME> | --block) mount <MOUNTPOINT>\n"
" Start local filesystem server and mount file system to <MOUNTPOINT>.\n"
" Use regular `umount <MOUNTPOINT>` to unmount the FS.\n"
" The server will be automatically stopped when the FS is unmounted.\n"
"\n"
"vitastor-nfs (--fs <NAME> | --block) start\n"
" Start network NFS server. Options:\n"
" --bind <IP> bind service to <IP> address (default 0.0.0.0)\n"
" --port <PORT> use port <PORT> for NFS services (default is 2049)\n"
" --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n"
"\n"
"OPTIONS:\n"
" --fs <NAME> use VitastorFS with metadata in image <NAME>\n"
" --block use pseudo-FS presenting images as files\n"
" --pool <POOL> use <POOL> as default pool for new files\n"
" --subdir <DIR> export <DIR> instead of root directory\n"
" --nfspath <PATH> set NFS export path to <PATH> (default is /)\n"
" --pidfile <FILE> write process ID to the specified file\n"
" --logfile <FILE> log to the specified file\n"
" --foreground 1 stay in foreground, do not daemonize\n"
"\n"
"NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n"
"you do not use client_enable_writeback=true, so you can freely use multiple\n"
"NFS proxies with L3 load balancing in this case.\n"
"\n"
"Example start and mount commands for a custom NFS port:\n"
" vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
" mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n"
"Or just:\n"
" vitastor-nfs mount --block --pool testpool /mnt/\n"
;
json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
std::vector<std::string> cmd;
for (int i = 1; i < narg; i++)
{
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
{
printf(
"Vitastor NFS 3.0 proxy\n"
"(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n"
"\n"
"USAGE:\n"
" %s [STANDARD OPTIONS] [OTHER OPTIONS]\n"
" --fs <META> mount VitastorFS with metadata in image <META>\n"
" --subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)\n"
" --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n"
" --bind <IP> bind service to <IP> address (default 0.0.0.0)\n"
" --nfspath <PATH> set NFS export path to <PATH> (default is /)\n"
" --port <PORT> use port <PORT> for NFS services (default is 2049)\n"
" --pool <POOL> use <POOL> as default pool for new files (images)\n"
" --logfile <FILE> log to the specified file\n"
" --foreground 1 stay in foreground, do not daemonize\n"
"\n"
"NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n"
"you do not use client_enable_writeback=true, so you can freely use multiple\n"
"NFS proxies with L3 load balancing in this case.\n"
"\n"
"Example start and mount commands for a custom NFS port:\n"
" %s --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
" mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n",
exe_name, exe_name
);
printf("%s", help_text);
exit(0);
}
else if (args[i][0] == '-' && args[i][1] == '-')
{
const char *opt = args[i]+2;
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "block") || i == narg-1 ? "1" : args[++i];
}
else
{
cmd.push_back(args[i]);
}
}
if (cfg.find("block") == cfg.end() && cfg.find("fs") == cfg.end())
{
fprintf(stderr, "Specify one of --block or --fs NAME. Use vitastor-nfs --help for details\n");
exit(1);
}
if (cmd.size() >= 2 && cmd[0] == "mount")
{
cfg["mount"] = cmd[1];
}
else if (cmd.size() >= 1 && cmd[0] == "start")
{
}
else
{
printf("%s", help_text);
exit(1);
}
return cfg;
}
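
The option loop treats --json and --block as boolean flags (they consume no value) and everything else as --key value pairs. A self-contained restatement of just that loop, for illustration:

#include <map>
#include <string.h>
#include <string>
#include <vector>

// "--json"/"--block" are valueless boolean flags, any other "--opt" consumes
// the next argument. E.g. {"vitastor-nfs","mount","--block","/mnt"} yields
// cmd={"mount","/mnt"} and cfg={{"block","1"}}.
static std::map<std::string, std::string> parse_opts_example(int narg, const char *args[])
{
    std::map<std::string, std::string> cfg;
    std::vector<std::string> cmd;
    for (int i = 1; i < narg; i++)
    {
        if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "block") || i == narg-1 ? "1" : args[++i];
        }
        else
            cmd.push_back(args[i]);
    }
    return cfg;
}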
@ -101,6 +140,7 @@ void nfs_proxy_t::run(json11::Json cfg)
// Parse options
if (cfg["logfile"].string_value() != "")
logfile = cfg["logfile"].string_value();
pidfile = cfg["pidfile"].string_value();
trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0;
bind_address = cfg["bind"].string_value();
if (bind_address == "")
@ -113,18 +153,6 @@ void nfs_proxy_t::run(json11::Json cfg)
export_root = cfg["nfspath"].string_value();
if (!export_root.size())
export_root = "/";
name_prefix = cfg["subdir"].string_value();
{
int e = name_prefix.size();
while (e > 0 && name_prefix[e-1] == '/')
e--;
int s = 0;
while (s < e && name_prefix[s] == '/')
s++;
name_prefix = name_prefix.substr(s, e-s);
if (name_prefix.size())
name_prefix += "/";
}
if (cfg["client_writeback_allowed"].is_null())
{
// NFS is always aware of fsync, so we allow write-back cache
@ -133,6 +161,15 @@ void nfs_proxy_t::run(json11::Json cfg)
obj["client_writeback_allowed"] = true;
cfg = obj;
}
mountpoint = cfg["mount"].string_value();
if (mountpoint != "")
{
bind_address = "127.0.0.1";
nfs_port = 0;
portmap_enabled = false;
exit_on_umount = true;
}
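In mount mode the proxy forces itself into a local-only configuration: loopback bind, a kernel-chosen ephemeral port, no portmap registration, and automatic shutdown on unmount. Spelled out as an explicit (illustrative) json11 config object, assuming the bundled json11 header path:

#include "json11/json11.hpp"

// The settings implied by "vitastor-nfs ... mount <DIR>", written out as an
// explicit configuration object (illustration only; exit_on_umount is an
// internal flag with no config key in this patch)
json11::Json::object local_mount_defaults()
{
    return json11::Json::object {
        { "bind", "127.0.0.1" }, // accept only local connections
        { "port", 0 },           // port 0 = let the kernel pick a free port
        { "portmap", 0 },        // skip portmap/rpcbind registration
    };
}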
fsname = cfg["fs"].string_value();
// Create client
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ringloop);
@ -142,11 +179,6 @@ void nfs_proxy_t::run(json11::Json cfg)
cmd->epmgr = epmgr;
cmd->cli = cli;
watch_stats();
if (!fs_kv_inode)
{
blockfs = new block_fs_state_t();
blockfs->init(this);
}
// Load image metadata
while (!cli->is_ready())
{
@ -158,70 +190,15 @@ void nfs_proxy_t::run(json11::Json cfg)
// Check default pool
check_default_pool();
// Check if we're using VitastorFS
fs_kv_inode = cfg["fs"].uint64_value();
if (fs_kv_inode)
if (fsname == "")
{
if (!INODE_POOL(fs_kv_inode))
{
fprintf(stderr, "FS metadata inode number must include pool\n");
exit(1);
}
blockfs = new block_fs_state_t();
blockfs->init(this, cfg);
}
else if (cfg["fs"].is_string())
else
{
for (auto & ic: cli->st_cli.inode_config)
{
if (ic.second.name == cfg["fs"].string_value())
{
fs_kv_inode = ic.first;
break;
}
}
if (!fs_kv_inode)
{
fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str());
exit(1);
}
}
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
if (!readdir_getattr_parallel)
readdir_getattr_parallel = 8;
id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value();
if (!id_alloc_batch_size)
id_alloc_batch_size = 200;
if (fs_kv_inode)
{
// Open DB and wait
int open_res = 0;
bool open_done = false;
db = new kv_dbw_t(cli);
db->open(fs_kv_inode, cfg, [&](int res)
{
open_done = true;
open_res = res;
});
while (!open_done)
{
ringloop->loop();
if (open_done)
break;
ringloop->wait();
}
if (open_res < 0)
{
fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n",
strerror(-open_res), open_res);
exit(1);
}
fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS));
fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
shared_inode_threshold = pool_block_size;
if (!cfg["shared_inode_threshold"].is_null())
{
shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value();
}
kvfs = new kv_fs_state_t;
kvfs->zero_block.resize(pool_block_size);
kvfs = new kv_fs_state_t();
kvfs->init(this, cfg);
}
// Self-register portmap and NFS
pmap.reg_ports.insert((portmap_id_t){
@ -253,7 +230,7 @@ void nfs_proxy_t::run(json11::Json cfg)
.addr = "0.0.0.0.0."+std::to_string(nfs_port),
});
// Create NFS socket and add it to epoll
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, NULL);
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port);
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
{
@ -285,24 +262,43 @@ void nfs_proxy_t::run(json11::Json cfg)
}
});
}
if (mountpoint != "")
{
mount_fs();
}
if (cfg["foreground"].is_null())
{
daemonize();
}
while (true)
if (pidfile != "")
{
write_pid();
}
while (!finished)
{
ringloop->loop();
ringloop->wait();
}
// Destroy the client
cli->flush();
delete kvfs;
delete db;
if (kvfs)
{
delete kvfs;
kvfs = NULL;
}
if (blockfs)
{
delete blockfs;
blockfs = NULL;
}
if (db)
{
delete db;
db = NULL;
}
delete cli;
delete epmgr;
delete ringloop;
kvfs = NULL;
db = NULL;
cli = NULL;
epmgr = NULL;
ringloop = NULL;
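
Passing &listening_port to create_and_bind_socket() matters in mount mode, where nfs_port is 0 and the kernel picks the port. Presumably the helper recovers it the standard way; a sketch of that technique:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

// Standard way to learn which port the kernel assigned after binding to
// port 0 - presumably what create_and_bind_socket() does to fill the
// int* out-parameter (this helper is an illustration, not project code)
static int get_bound_port(int sock)
{
    sockaddr_in addr;
    socklen_t len = sizeof(addr);
    memset(&addr, 0, sizeof(addr));
    if (getsockname(sock, (sockaddr*)&addr, &len) < 0)
        return -1;
    return ntohs(addr.sin_port);
}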
@ -376,7 +372,7 @@ void nfs_proxy_t::parse_stats(etcd_kv_t & kv)
inode_t inode_num = 0;
char null_byte = 0;
int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode_num, &null_byte);
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num)
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
}
@ -410,8 +406,6 @@ void nfs_proxy_t::check_default_pool()
auto pool_it = cli->st_cli.pool_config.begin();
default_pool_id = pool_it->first;
default_pool = pool_it->second.name;
pool_block_size = pool_it->second.pg_stripe_size;
pool_alignment = pool_it->second.bitmap_granularity;
}
else
{
@ -426,8 +420,6 @@ void nfs_proxy_t::check_default_pool()
if (p.second.name == default_pool)
{
default_pool_id = p.first;
pool_block_size = p.second.pg_stripe_size;
pool_alignment = p.second.bitmap_granularity;
break;
}
}
@ -446,12 +438,14 @@ void nfs_proxy_t::do_accept(int listen_fd)
int nfs_fd = 0;
while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0)
{
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
if (trace)
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
active_connections++;
fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
auto cli = new nfs_client_t();
if (fs_kv_inode)
if (kvfs)
nfs_kv_procs(cli);
else
nfs_block_procs(cli);
@ -466,8 +460,12 @@ void nfs_proxy_t::do_accept(int listen_fd)
// Handle incoming event
if (epoll_events & EPOLLRDHUP)
{
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
auto parent = cli->parent;
if (parent->trace)
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
cli->stop();
parent->active_connections--;
parent->check_exit();
return;
}
cli->epoll_events |= epoll_events;
@ -1006,6 +1004,113 @@ void nfs_proxy_t::daemonize()
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
}
void nfs_proxy_t::write_pid()
{
int fd = open(pidfile.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
if (fd < 0)
{
fprintf(stderr, "Failed to create pid file %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno);
return;
}
auto pid = std::to_string(getpid());
if (write(fd, pid.c_str(), pid.size()) < 0)
{
fprintf(stderr, "Failed to write pid to %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno);
}
close(fd);
}
static pid_t wanted_pid = 0;
static bool child_finished = false;
static int child_status = -1;
void single_child_handler(int signal)
{
child_finished = true;
waitpid(wanted_pid, &child_status, WNOHANG);
}
void nfs_proxy_t::mount_fs()
{
signal(SIGCHLD, single_child_handler);
auto pid = fork();
if (pid < 0)
{
fprintf(stderr, "Failed to fork: %s (code %d)\n", strerror(errno), errno);
exit(1);
}
if (pid > 0)
{
// Parent - loop and wait until child finishes
wanted_pid = pid;
while (!child_finished)
{
ringloop->loop();
ringloop->wait();
}
if (!WIFEXITED(child_status) || WEXITSTATUS(child_status) != 0)
{
// Mounting failed
exit(1);
}
if (fsname != "")
fprintf(stderr, "Successfully mounted VitastorFS %s at %s\n", fsname.c_str(), mountpoint.c_str());
else
fprintf(stderr, "Successfully mounted Vitastor pseudo-FS at %s\n", mountpoint.c_str());
}
else
{
// Child
std::string src = ("localhost:"+export_root);
std::string opts = ("port="+std::to_string(listening_port)+",mountport="+std::to_string(listening_port)+",nfsvers=3,soft,nolock,tcp");
const char *args[] = { "mount", src.c_str(), mountpoint.c_str(), "-o", opts.c_str(), NULL };
execvp("mount", (char* const*)args);
fprintf(stderr, "Failed to run mount %s %s -o %s: %s (code %d)\n",
src.c_str(), mountpoint.c_str(), opts.c_str(), strerror(errno), errno);
exit(1);
}
}
void nfs_proxy_t::check_exit()
{
if (active_connections || !exit_on_umount)
{
return;
}
std::string mountstr = read_file("/proc/mounts");
if (mountstr == "")
{
return;
}
auto port_opt = "port="+std::to_string(listening_port);
auto mountport_opt = "port="+std::to_string(listening_port);
auto mounts = explode("\n", mountstr, true);
for (auto & str: mounts)
{
auto opts = explode(" ", str, true);
if (opts.size() >= 4 && opts[2].size() >= 3 && opts[2].substr(0, 3) == "nfs")
{
opts = explode(",", opts[3], true);
bool port_found = false;
bool addr_found = false;
for (auto & opt: opts)
{
if (opt == port_opt || opt == mountport_opt)
port_found = true;
if (opt == "addr=127.0.0.1" || opt == "mountaddr=127.0.0.1")
addr_found = true;
}
if (port_found && addr_found)
{
// OK, do not unmount
return;
}
}
}
// Our mount is gone - stop the server
finished = true;
}
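
check_exit() decides whether our own mount is still present by scanning /proc/mounts, whose lines have the form "device mountpoint fstype options dump pass". An illustrative line and a standalone parse of those fields (the sample values are made up):

#include <iostream>
#include <sstream>
#include <string>

int main()
{
    // Illustrative /proc/mounts line of the shape check_exit() looks for:
    // device, mountpoint, fstype, options, dump, pass
    std::string line = "localhost:/ /mnt nfs "
        "rw,vers=3,port=2050,mountport=2050,addr=127.0.0.1,mountaddr=127.0.0.1 0 0";
    std::istringstream iss(line);
    std::string dev, mnt, fstype, options;
    iss >> dev >> mnt >> fstype >> options;
    // check_exit() keeps running only if fstype starts with "nfs" and the
    // options contain both our port/mountport and a 127.0.0.1 (mount)addr
    std::cout << fstype << " -> " << options << "\n";
    return 0;
}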
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);

View File

@ -21,24 +21,23 @@ class nfs_proxy_t
{
public:
std::string bind_address;
std::string name_prefix;
uint64_t fsid = 1;
uint64_t server_id = 0;
std::string default_pool;
std::string export_root;
bool portmap_enabled;
unsigned nfs_port;
uint64_t fs_kv_inode = 0;
uint64_t fs_base_inode = 0;
uint64_t fs_inode_count = 0;
int readdir_getattr_parallel = 8, id_alloc_batch_size = 200;
int trace = 0;
std::string logfile = "/dev/null";
std::string pidfile;
bool exit_on_umount = false;
std::string mountpoint;
std::string fsname;
pool_id_t default_pool_id;
uint64_t pool_block_size = 0;
uint64_t pool_alignment = 0;
uint64_t shared_inode_threshold = 0;
int active_connections = 0;
bool finished = false;
int listening_port = 0;
pool_id_t default_pool_id = 0;
portmap_service_t pmap;
ring_loop_t *ringloop = NULL;
@ -65,6 +64,9 @@ public:
void check_default_pool();
void do_accept(int listen_fd);
void daemonize();
void write_pid();
void mount_fs();
void check_exit();
};
struct rpc_cur_buffer_t

View File

@ -1,9 +1,10 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include "str_util.h"
std::string base64_encode(const std::string &in)
@ -304,6 +305,23 @@ std::string read_all_fd(int fd)
return res;
}
std::string read_file(std::string file, bool allow_enoent)
{
std::string res;
int fd = open(file.c_str(), O_RDONLY);
if (fd < 0 || (res = read_all_fd(fd)) == "")
{
int err = errno;
if (fd >= 0)
close(fd);
if (!allow_enoent || err != ENOENT)
fprintf(stderr, "Failed to read %s: %s (code %d)\n", file.c_str(), strerror(err), err);
return "";
}
close(fd);
return res;
}
std::string str_repeat(const std::string & str, int times)
{
std::string r;

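read_file() now lives in str_util so the proxy can reuse it for /proc/mounts; allow_enoent=true silences the error message when the file simply does not exist. Typical usage (illustrative paths):

#include "str_util.h"

// An empty result means a missing, empty or unreadable file; with
// allow_enoent=true a missing file stays silent, anything else is
// logged to stderr
void read_file_example()
{
    std::string mounts = read_file("/proc/mounts");
    if (mounts == "")
        return; // nothing to parse (or read error, already logged)
    // optional config: do not complain if it is absent
    std::string extra = read_file("/etc/vitastor/nfs_extra.conf", true);
    (void)extra;
}
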
View File

@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <stdint.h>
@ -18,6 +18,7 @@ std::string format_size(uint64_t size, bool nobytes = false);
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
uint64_t parse_time(std::string time_str, bool *ok = NULL);
std::string read_all_fd(int fd);
std::string read_file(std::string file, bool allow_enoent = false);
std::string str_repeat(const std::string & str, int times);
size_t utf8_length(const std::string & s);
size_t utf8_length(const char *s);

View File

@ -4,7 +4,7 @@ PG_COUNT=16
. `dirname $0`/run_3osds.sh
build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta
build/src/vitastor-nfs --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
build/src/vitastor-nfs start --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
NFS_PID=$!
mkdir -p testdata/nfs