diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b7580f8..06027af8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -213,7 +213,19 @@ target_link_libraries(vitastor-kv-stress # vitastor-nfs add_executable(vitastor-nfs nfs_proxy.cpp - nfs_conn.cpp + nfs_kv.cpp + nfs_kv_create.cpp + nfs_kv_fsstat.cpp + nfs_kv_getattr.cpp + nfs_kv_link.cpp + nfs_kv_lookup.cpp + nfs_kv_read.cpp + nfs_kv_readdir.cpp + nfs_kv_remove.cpp + nfs_kv_rename.cpp + nfs_kv_setattr.cpp + nfs_kv_write.cpp + nfs_mount.cpp nfs_portmap.cpp sha256.c nfs/xdr_impl.cpp diff --git a/src/nfs_conn.cpp b/src/nfs_conn.cpp deleted file mode 100644 index b1d0ed7f..00000000 --- a/src/nfs_conn.cpp +++ /dev/null @@ -1,2474 +0,0 @@ -// Copyright (c) Vitaliy Filippov, 2019+ -// License: VNPL-1.1 (see README.md for details) -// -// NFS connection handler for NFS proxy over VitastorKV database - -#include - -#include "str_util.h" - -#include "nfs_proxy.h" - -#include "nfs/nfs.h" - -#include "cli.h" - -#define TRUE 1 -#define FALSE 0 - -#define MAX_REQUEST_SIZE 128*1024*1024 - -static unsigned len_pad4(unsigned len) -{ - return len + (len&3 ? 4-(len&3) : 0); -} - -static nfsstat3 vitastor_nfs_map_err(int err) -{ - return (err == EINVAL ? NFS3ERR_INVAL - : (err == ENOENT ? NFS3ERR_NOENT - : (err == ENOSPC ? NFS3ERR_NOSPC - : (err == EEXIST ? NFS3ERR_EXIST - : (err == EIO ? NFS3ERR_IO : (err ? NFS3ERR_IO : NFS3_OK)))))); -} - -static int nfs3_null_proc(void *opaque, rpc_op_t *rop) -{ - rpc_queue_reply(rop); - return 0; -} - -static nfstime3 nfstime_from_str(const std::string & s) -{ - nfstime3 t; - auto p = s.find("."); - if (p != std::string::npos) - { - t.seconds = stoull_full(s.substr(0, p), 10); - t.nseconds = stoull_full(s.substr(p+1), 10); - p = s.size()-p-1; - for (; p < 9; p++) - t.nseconds *= 10; - for (; p > 9; p--) - t.nseconds /= 10; - } - else - t.seconds = stoull_full(s, 10); - return t; -} - -static std::string nfstime_to_str(nfstime3 t) -{ - char buf[32]; - snprintf(buf, sizeof(buf), "%u.%09u", t.seconds, t.nseconds); - return buf; -} - -static int kv_map_type(const std::string & type) -{ - return (type == "" || type == "file" ? NF3REG : - (type == "dir" ? NF3DIR : - (type == "blk" ? NF3BLK : - (type == "chr" ? NF3CHR : - (type == "link" ? NF3LNK : - (type == "sock" ? NF3SOCK : - (type == "fifo" ? NF3FIFO : -1))))))); -} - -static fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs) -{ - auto type = kv_map_type(attrs["type"].string_value()); - auto mode = attrs["mode"].uint64_value(); - auto nlink = attrs["nlink"].uint64_value(); - nfstime3 mtime = nfstime_from_str(attrs["mtime"].string_value()); - nfstime3 atime = attrs["atime"].is_null() ? mtime : nfstime_from_str(attrs["atime"].string_value()); - // FIXME In theory we could store the binary structure itself instead of JSON - return (fattr3){ - .type = (type == 0 ? NF3REG : (ftype3)type), - .mode = (attrs["mode"].is_null() ? (type == NF3DIR ? 0755 : 0644) : (uint32_t)mode), - .nlink = (nlink == 0 ? 1 : (uint32_t)nlink), - .uid = (uint32_t)attrs["uid"].uint64_value(), - .gid = (uint32_t)attrs["gid"].uint64_value(), - .size = (type == NF3DIR ? 4096 : attrs["size"].uint64_value()), - .used = (type == NF3DIR ? 4096 : attrs["alloc"].uint64_value()), - .rdev = (type == NF3BLK || type == NF3CHR - ? (specdata3){ (uint32_t)attrs["major"].uint64_value(), (uint32_t)attrs["minor"].uint64_value() } - : (specdata3){}), - .fsid = self->parent->fsid, - .fileid = ino, - .atime = atime, - .mtime = mtime, - .ctime = mtime, - }; -} - -static std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename) -{ - // encode as: d / - char key[24] = { 0 }; - snprintf(key, sizeof(key), "d-%jx/", dir_ino); - int n = strnlen(key, sizeof(key)-1) - 3; - if (n < 10) - key[1] = '0'+n; - else - key[1] = 'A'+(n-10); - return (char*)key + filename; -} - -static uint64_t kv_root_inode = 1; - -static const char *kv_next_id_key = "id"; - -static const char *kv_root_handle = "R"; - -static std::string kv_direntry_filename(const std::string & key) -{ - // decode as: d / - auto pos = key.find("/"); - if (pos != std::string::npos) - return key.substr(pos+1); - return key; -} - -static std::string kv_inode_key(uint64_t ino) -{ - char key[24] = { 0 }; - snprintf(key, sizeof(key), "i-%jx", ino); - int n = strnlen(key, sizeof(key)-1) - 2; - if (n < 10) - key[1] = '0'+n; - else - key[1] = 'A'+(n-10); - return std::string(key, n+2); -} - -static std::string kv_fh(uint64_t ino) -{ - return "S"+std::string((char*)&ino, 8); -} - -static uint64_t kv_fh_inode(const std::string & fh) -{ - if (fh.size() == 1 && fh[0] == 'R') - { - return 1; - } - else if (fh.size() == 9 && fh[0] == 'S') - { - return *(uint64_t*)&fh[1]; - } - else if (fh.size() > 17 && fh[0] == 'I') - { - return *(uint64_t*)&fh[fh.size()-8]; - } - return 0; -} - -static bool kv_fh_valid(const std::string & fh) -{ - return fh == kv_root_handle || fh.size() == 9 && fh[0] == 'S' || fh.size() > 17 && fh[0] == 'I'; -} - -// Attributes are always stored in the inode -static void kv_read_inode(nfs_client_t *self, uint64_t ino, - std::function cb) -{ - auto key = kv_inode_key(ino); - self->parent->db->get(key, [=](int res, const std::string & value) - { - if (ino == kv_root_inode && res == -ENOENT) - { - // Allow root inode to not exist - cb(0, "", json11::Json(json11::Json::object{ { "type", "dir" } })); - return; - } - if (res < 0) - { - if (res != -ENOENT) - fprintf(stderr, "Error reading inode %s: %s (code %d)\n", kv_inode_key(ino).c_str(), strerror(-res), res); - cb(res, "", json11::Json()); - return; - } - std::string err; - auto attrs = json11::Json::parse(value, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in inode %s = %s: %s\n", kv_inode_key(ino).c_str(), value.c_str(), err.c_str()); - res = -EIO; - } - cb(res, value, attrs); - }); -} - -static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - GETATTR3args *args = (GETATTR3args*)rop->request; - GETATTR3res *reply = (GETATTR3res*)rop->reply; - std::string fh = args->object; - auto ino = kv_fh_inode(fh); - if (self->parent->trace) - fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino); - if (!kv_fh_valid(fh)) - { - *reply = (GETATTR3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs) - { - if (res < 0) - { - *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) }; - } - else - { - *reply = (GETATTR3res){ - .status = NFS3_OK, - .resok = (GETATTR3resok){ - .obj_attributes = get_kv_attributes(self, ino, attrs), - }, - }; - } - rpc_queue_reply(rop); - }); - return 1; -} - -struct nfs_kv_setattr_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - uint64_t ino = 0; - uint64_t old_size = 0, new_size = 0; - json11::Json::object set_attrs; - int res = 0, cas_res = 0; - std::string ientry_text; - json11::Json ientry; - json11::Json::object new_attrs; - std::function cb; -}; - -static void nfs_kv_continue_setattr(nfs_kv_setattr_state *st, int state) -{ - if (state == 0) {} - else if (state == 1) goto resume_1; - else if (state == 2) goto resume_2; - else if (state == 3) goto resume_3; - else - { - fprintf(stderr, "BUG: invalid state in nfs_kv_continue_setattr()"); - abort(); - } -resume_0: - kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs) - { - st->res = res; - st->ientry_text = value; - st->ientry = attrs; - nfs_kv_continue_setattr(st, 1); - }); - return; -resume_1: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - if (st->ientry["type"].string_value() == "link" || - st->ientry["type"].string_value() != "file" && - st->ientry["type"].string_value() != "" && - !st->set_attrs["size"].is_null()) - { - auto cb = std::move(st->cb); - cb(-EINVAL); - return; - } - // Now we can update it - st->new_attrs = st->ientry.object_items(); - st->old_size = st->ientry["size"].uint64_value(); - for (auto & kv: st->set_attrs) - { - if (kv.first == "size") - { - st->new_size = kv.second.uint64_value(); - } - st->new_attrs[kv.first] = kv.second; - } - st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(st->new_attrs).dump(), [st](int res) - { - st->res = res; - nfs_kv_continue_setattr(st, 2); - }, [st](int res, const std::string & cas_value) - { - st->cas_res = res; - return (res == 0 || res == -ENOENT && st->ino == kv_root_inode) && cas_value == st->ientry_text; - }); - return; -resume_2: - if (st->cas_res == -ENOENT) - { - st->res = -ENOENT; - } - if (st->res == -EAGAIN) - { - // Retry - fprintf(stderr, "CAS failure during setattr, retrying\n"); - goto resume_0; - } - if (st->res < 0) - { - fprintf(stderr, "Failed to update inode %ju: %s (code %d)\n", st->ino, strerror(-st->res), st->res); - auto cb = std::move(st->cb); - cb(st->res); - return; - } - if (!st->set_attrs["size"].is_null() && - st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value()) - { - // Delete extra data when downsizing - st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { - { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, - { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, - { "min_offset", st->set_attrs["size"].uint64_value() }, - }), [st](const cli_result_t & r) - { - if (r.err) - { - fprintf(stderr, "Failed to truncate inode %ju: %s (code %d)\n", - st->ino, r.text.c_str(), r.err); - } - st->res = r.err; - nfs_kv_continue_setattr(st, 3); - }); - return; - } -resume_3: - auto cb = std::move(st->cb); - cb(0); -} - -static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop) -{ - nfs_kv_setattr_state *st = new nfs_kv_setattr_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - auto args = (SETATTR3args*)rop->request; - auto reply = (SETATTR3res*)rop->reply; - std::string fh = args->object; - if (!kv_fh_valid(fh)) - { - *reply = (SETATTR3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - st->ino = kv_fh_inode(fh); - if (args->new_attributes.size.set_it) - st->set_attrs["size"] = args->new_attributes.size.size; - if (args->new_attributes.mode.set_it) - st->set_attrs["mode"] = (uint64_t)args->new_attributes.mode.mode; - if (args->new_attributes.uid.set_it) - st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid; - if (args->new_attributes.gid.set_it) - st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid; - if (args->new_attributes.atime.set_it) - st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime); - if (args->new_attributes.mtime.set_it) - st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime); - fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str()); - st->cb = [st](int res) - { - auto reply = (SETATTR3res*)st->rop->reply; - if (res < 0) - { - *reply = (SETATTR3res){ - .status = vitastor_nfs_map_err(res), - }; - } - else - { - *reply = (SETATTR3res){ - .status = NFS3_OK, - .resok = (SETATTR3resok){ - .obj_wcc = (wcc_data){ - .after = (post_op_attr){ - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, st->ino, st->new_attrs), - }, - }, - }, - }; - } - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_setattr(st, 0); - return 1; -} - -static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - LOOKUP3args *args = (LOOKUP3args*)rop->request; - LOOKUP3res *reply = (LOOKUP3res*)rop->reply; - inode_t dir_ino = kv_fh_inode(args->what.dir); - std::string filename = args->what.name; - if (self->parent->trace) - fprintf(stderr, "[%d] LOOKUP %ju/%s\n", self->nfs_fd, dir_ino, filename.c_str()); - if (!dir_ino || filename == "") - { - *reply = (LOOKUP3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - self->parent->db->get(kv_direntry_key(dir_ino, filename), [=](int res, const std::string & value) - { - if (res < 0) - { - *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(-res) }; - rpc_queue_reply(rop); - return; - } - std::string err; - auto direntry = json11::Json::parse(value, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", kv_direntry_key(dir_ino, filename).c_str(), value.c_str(), err.c_str()); - *reply = (LOOKUP3res){ .status = NFS3ERR_IO }; - rpc_queue_reply(rop); - return; - } - uint64_t ino = direntry["ino"].uint64_value(); - kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json ientry) - { - if (res < 0) - { - *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(res == -ENOENT ? -EIO : res) }; - rpc_queue_reply(rop); - return; - } - *reply = (LOOKUP3res){ - .status = NFS3_OK, - .resok = (LOOKUP3resok){ - .object = xdr_copy_string(rop->xdrs, kv_fh(ino)), - .obj_attributes = { - .attributes_follow = 1, - .attributes = get_kv_attributes(self, ino, ientry), - }, - }, - }; - rpc_queue_reply(rop); - }); - }); - return 1; -} - -static int nfs3_access_proc(void *opaque, rpc_op_t *rop) -{ - //nfs_client_t *self = (nfs_client_t*)opaque; - ACCESS3args *args = (ACCESS3args*)rop->request; - ACCESS3res *reply = (ACCESS3res*)rop->reply; - *reply = (ACCESS3res){ - .status = NFS3_OK, - .resok = (ACCESS3resok){ - .access = args->access, - }, - }; - rpc_queue_reply(rop); - return 0; -} - -static int nfs3_readlink_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - READLINK3args *args = (READLINK3args*)rop->request; - if (self->parent->trace) - fprintf(stderr, "[%d] READLINK %ju\n", self->nfs_fd, kv_fh_inode(args->symlink)); - READLINK3res *reply = (READLINK3res*)rop->reply; - if (!kv_fh_valid(args->symlink) || args->symlink == kv_root_handle) - { - // Invalid filehandle or trying to read symlink from root entry - *reply = (READLINK3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - kv_read_inode(self, kv_fh_inode(args->symlink), [=](int res, const std::string & value, json11::Json attrs) - { - if (res < 0) - { - *reply = (READLINK3res){ .status = vitastor_nfs_map_err(-res) }; - } - else if (attrs["type"] != "link") - { - *reply = (READLINK3res){ .status = NFS3ERR_INVAL }; - } - else - { - *reply = (READLINK3res){ - .status = NFS3_OK, - .resok = (READLINK3resok){ - .data = xdr_copy_string(rop->xdrs, attrs["symlink"].string_value()), - }, - }; - } - rpc_queue_reply(rop); - }); - return 1; -} - -static int nfs3_read_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - READ3args *args = (READ3args*)rop->request; - READ3res *reply = (READ3res*)rop->reply; - inode_t ino = kv_fh_inode(args->file); - if (args->count > MAX_REQUEST_SIZE || !ino) - { - *reply = (READ3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - uint64_t alignment = self->parent->cli->st_cli.global_bitmap_granularity; - auto pool_cfg = self->parent->cli->st_cli.pool_config.find(INODE_POOL(self->parent->fs_base_inode)); - if (pool_cfg != self->parent->cli->st_cli.pool_config.end()) - { - alignment = pool_cfg->second.bitmap_granularity; - } - uint64_t aligned_offset = args->offset - (args->offset % alignment); - uint64_t aligned_count = args->offset + args->count; - if (aligned_count % alignment) - aligned_count = aligned_count + alignment - (aligned_count % alignment); - aligned_count -= aligned_offset; - void *buf = malloc_or_die(aligned_count); - xdr_add_malloc(rop->xdrs, buf); - cluster_op_t *op = new cluster_op_t; - op->opcode = OSD_OP_READ; - op->inode = self->parent->fs_base_inode + ino; - op->offset = aligned_offset; - op->len = aligned_count; - op->iov.push_back(buf, aligned_count); - *reply = (READ3res){ .status = NFS3_OK }; - reply->resok.data.data = (char*)buf + args->offset - aligned_offset; - reply->resok.data.size = args->count; - op->callback = [rop](cluster_op_t *op) - { - READ3res *reply = (READ3res*)rop->reply; - if (op->retval != op->len) - { - *reply = (READ3res){ .status = vitastor_nfs_map_err(-op->retval) }; - } - else - { - auto & reply_ok = reply->resok; - // reply_ok.data.data is already set above - reply_ok.count = reply_ok.data.size; - reply_ok.eof = 0; - } - rpc_queue_reply(rop); - delete op; - }; - self->parent->cli->execute(op); - return 1; -} - -static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf); - -static int nfs3_write_proc(void *opaque, rpc_op_t *rop) -{ - // FIXME: Pack small files into "shared inodes" - // Insane algorithm... - // Threshold should be equal to one allocation block of the pool - // Write: - // - If (offset+size <= threshold): - // - Read inode from cache - // - If inode does not exist - stop with -ENOENT - // - If it's empty (size == 0 || empty == true): - // - If preset size is larger than threshold: - // - Write data into non-shared inode - // - In parallel: clear empty flag - // - If CAS failure: re-read inode and restart - // - Otherwise: - // - Allocate/take a shared inode - // - Allocate space in its end - // - Write data into shared inode - // - If CAS failure: allocate another shared inode and retry - // - Write shared inode reference, set size - // - If CAS failure: free allocated shared space, re-read inode and restart - // - If it's not empty: - // - If non-shared: - // - Write data into non-shared inode - // - In parallel: check if data fits into inode size and extend if it doesn't - // - If CAS failure: re-read inode and retry to extend the size - // - If shared: - // - Read whole file from shared inode - // - If the file header in data doesn't match: re-read inode and restart - // - If data doesn't fit into the same shared inode: - // - Allocate space in a new shared inode - // - Write data into the new shared inode - // - If CAS failure: allocate another shared inode and retry - // - Update inode metadata (set new size and new shared inode) - // - If CAS failure: free allocated shared space, re-read inode and restart - // - If it fits: - // - Write updated data into the shared inode - // - Update shared inode header - // - If CAS failure: re-read inode and restart - // - Otherwise: - // - Write data into non-shared inode - // - Read inode in parallel - // - If shared: - // - Read whole file from shared inode - // - Write data into non-shared inode - // - If CAS failure (block should not exist): restart - // - Update inode metadata (make non-shared, update size) - // - If CAS failure: restart - // - Zero out the shared inode header - // - If CAS failure: restart - // - Check if size fits - // - Extend if it doesn't - // Read: - // - If (offset+size <= threshold): - // - Read inode from cache - // - If empty: return zeroes - // - If shared: - // - Read the whole file from shared inode, or at least data and shared inode header - // - If the file header in data doesn't match: re-read inode and restart - // - If non-shared: - // - Read data from non-shared inode - // - Otherwise: - // - Read data from non-shared inode - nfs_client_t *self = (nfs_client_t*)opaque; - WRITE3args *args = (WRITE3args*)rop->request; - WRITE3res *reply = (WRITE3res*)rop->reply; - inode_t ino = kv_fh_inode(args->file); - if (!ino) - { - *reply = (WRITE3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - if (args->count > MAX_REQUEST_SIZE) - { - *reply = (WRITE3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - return 0; - } - uint64_t count = args->count > args->data.size ? args->data.size : args->count; - uint64_t alignment = self->parent->cli->st_cli.global_bitmap_granularity; - auto pool_cfg = self->parent->cli->st_cli.pool_config.find(INODE_POOL(self->parent->fs_base_inode)); - if (pool_cfg != self->parent->cli->st_cli.pool_config.end()) - { - alignment = pool_cfg->second.bitmap_granularity; - } - // Pre-fill reply - *reply = (WRITE3res){ - .status = NFS3_OK, - .resok = (WRITE3resok){ - //.file_wcc = ..., - .count = (unsigned)count, - }, - }; - if ((args->offset % alignment) != 0 || (count % alignment) != 0) - { - // Unaligned write, requires read-modify-write - uint64_t aligned_offset = args->offset - (args->offset % alignment); - uint64_t aligned_count = args->offset + args->count; - if (aligned_count % alignment) - aligned_count = aligned_count + alignment - (aligned_count % alignment); - aligned_count -= aligned_offset; - void *buf = malloc_or_die(aligned_count); - xdr_add_malloc(rop->xdrs, buf); - // Read - cluster_op_t *op = new cluster_op_t; - op->opcode = OSD_OP_READ; - op->inode = self->parent->fs_base_inode + ino; - op->offset = aligned_offset; - op->len = aligned_count; - op->iov.push_back(buf, aligned_count); - op->callback = [self, rop, count](cluster_op_t *op) - { - if (op->retval != op->len) - { - WRITE3res *reply = (WRITE3res*)rop->reply; - *reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) }; - rpc_queue_reply(rop); - return; - } - void *buf = op->iov.buf[0].iov_base; - WRITE3args *args = (WRITE3args*)rop->request; - memcpy((uint8_t*)buf + args->offset - op->offset, args->data.data, count); - nfs_resize_write(self, rop, op->inode, args->offset+count, op->offset, op->len, buf); - delete op; - }; - self->parent->cli->execute(op); - } - else - { - nfs_resize_write(self, rop, ino, args->offset+count, args->offset, count, args->data.data); - } - return 1; -} - -static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t inode, int res) -{ - WRITE3args *args = (WRITE3args*)rop->request; - WRITE3res *reply = (WRITE3res*)rop->reply; - if (res < 0) - { - *reply = (WRITE3res){ .status = vitastor_nfs_map_err(res) }; - rpc_queue_reply(rop); - return; - } - bool imm = self->parent->cli->get_immediate_commit(inode); - reply->resok.committed = args->stable != UNSTABLE || imm ? FILE_SYNC : UNSTABLE; - *(uint64_t*)reply->resok.verf = self->parent->server_id; - if (args->stable != UNSTABLE && !imm) - { - // Client requested a stable write. Add an fsync - auto op = new cluster_op_t; - op->opcode = OSD_OP_SYNC; - op->callback = [rop](cluster_op_t *op) - { - if (op->retval != 0) - { - WRITE3res *reply = (WRITE3res*)rop->reply; - *reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) }; - } - delete op; - rpc_queue_reply(rop); - }; - self->parent->cli->execute(op); - } - else - { - rpc_queue_reply(rop); - } -} - -static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err) -{ - auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 }); - while (ext_it != self->extend_writes.end() && - ext_it->first.inode == inode && - ext_it->first.new_size <= new_size) - { - ext_it->second.resize_res = err; - if (ext_it->second.write_res <= 0) - { - complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0 - ? ext_it->second.write_res : ext_it->second.resize_res); - self->extend_writes.erase(ext_it++); - } - else - ext_it++; - } -} - -static void extend_inode(nfs_client_t *self, uint64_t inode) -{ - // Send an extend request - auto ext = &self->extends[inode]; - self->parent->db->set(kv_inode_key(inode), json11::Json(ext->attrs).dump().c_str(), [=](int res) - { - if (res < 0 && res != -EAGAIN) - { - fprintf(stderr, "Error extending inode %ju to %ju bytes: %s (code %d)\n", inode, ext->cur_extend, strerror(-res), res); - } - if (res == -EAGAIN || ext->next_extend > ext->cur_extend) - { - // Multiple concurrent resize requests received, try to repeat - kv_read_inode(self, inode, [=](int res, const std::string & value, json11::Json attrs) - { - auto new_size = ext->next_extend > ext->cur_extend ? ext->next_extend : ext->cur_extend; - if (res == 0 && attrs["size"].uint64_value() < new_size) - { - ext->old_ientry = value; - ext->attrs = attrs.object_items(); - ext->cur_extend = new_size; - ext->attrs["size"] = new_size; - extend_inode(self, inode); - } - else - { - if (res < 0) - { - fprintf(stderr, "Error extending inode %ju to %ju bytes: %s (code %d)\n", inode, ext->cur_extend, strerror(-res), res); - } - ext->cur_extend = ext->next_extend = 0; - complete_extend_inode(self, inode, attrs["size"].uint64_value(), res); - } - }); - return; - } - auto new_size = ext->cur_extend; - ext->cur_extend = ext->next_extend = 0; - complete_extend_inode(self, inode, new_size, res); - }); -} - -static void nfs_do_write(nfs_client_t *self, std::multimap::iterator ewr_it, - rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf) -{ - cluster_op_t *op = new cluster_op_t; - op->opcode = OSD_OP_WRITE; - op->inode = self->parent->fs_base_inode + inode; - op->offset = offset; - op->len = count; - op->iov.push_back(buf, count); - op->callback = [self, ewr_it, rop](cluster_op_t *op) - { - auto inode = op->inode; - int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0); - if (ewr_it == self->extend_writes.end()) - { - complete_extend_write(self, rop, inode, write_res); - } - else - { - ewr_it->second.write_res = write_res; - if (ewr_it->second.resize_res <= 0) - { - complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res); - self->extend_writes.erase(ewr_it); - } - } - }; - self->parent->cli->execute(op); -} - -static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf) -{ - // Check if we have to resize the inode during write - kv_read_inode(self, inode, [=](int res, const std::string & value, json11::Json attrs) - { - if (res < 0) - { - *(WRITE3res*)rop->reply = (WRITE3res){ .status = vitastor_nfs_map_err(-res) }; - rpc_queue_reply(rop); - } - else if (kv_map_type(attrs["type"].string_value()) != NF3REG) - { - *(WRITE3res*)rop->reply = (WRITE3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - } - else if (attrs["size"].uint64_value() < new_size) - { - auto ewr_it = self->extend_writes.emplace((extend_size_t){ - .inode = inode, - .new_size = new_size, - }, (extend_write_t){ - .rop = rop, - .resize_res = 1, - .write_res = 1, - }); - auto ext = &self->extends[inode]; - if (ext->cur_extend > 0) - { - // Already resizing, just wait - if (ext->next_extend < new_size) - ext->next_extend = new_size; - } - else - { - ext->old_ientry = value; - ext->attrs = attrs.object_items(); - ext->cur_extend = new_size; - ext->attrs["size"] = new_size; - extend_inode(self, inode); - } - nfs_do_write(self, ewr_it, rop, inode, offset, count, buf); - } - else - { - nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf); - } - }); -} - -static void allocate_new_id(nfs_client_t *self, std::function cb) -{ - if (self->parent->fs_next_id <= self->parent->fs_allocated_id) - { - cb(0, self->parent->fs_next_id++); - return; - } - else if (self->parent->fs_next_id > self->parent->fs_inode_count) - { - cb(-ENOSPC, 0); - return; - } - self->parent->db->get(kv_next_id_key, [=](int res, const std::string & prev_str) - { - if (res < 0 && res != -ENOENT) - { - cb(res, 0); - return; - } - uint64_t prev_val = stoull_full(prev_str); - if (prev_val >= self->parent->fs_inode_count) - { - cb(-ENOSPC, 0); - return; - } - if (prev_val < 1) - { - prev_val = 1; - } - uint64_t new_val = prev_val + self->parent->id_alloc_batch_size; - if (new_val >= self->parent->fs_inode_count) - { - new_val = self->parent->fs_inode_count; - } - self->parent->db->set(kv_next_id_key, std::to_string(new_val), [=](int res) - { - if (res == -EAGAIN) - { - // CAS failure - retry - allocate_new_id(self, cb); - } - else if (res < 0) - { - cb(res, 0); - } - else - { - self->parent->fs_next_id = prev_val+2; - self->parent->fs_allocated_id = new_val; - cb(0, prev_val+1); - } - }, [prev_val](int res, const std::string & value) - { - // FIXME: Allow to modify value from CAS callback? ("update" query) - return res < 0 || stoull_full(value) == prev_val; - }); - }); -} - -struct kv_create_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - bool exclusive = false; - uint64_t verf = 0; - uint64_t dir_ino = 0; - std::string filename; - uint64_t new_id = 0; - json11::Json::object attrobj; - json11::Json attrs; - std::string direntry_text; - uint64_t dup_ino = 0; - std::function cb; -}; - -static void kv_do_create(kv_create_state *st) -{ - if (st->self->parent->trace) - fprintf(stderr, "[%d] CREATE %ju/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str()); - if (st->filename == "" || st->filename.find("/") != std::string::npos) - { - auto cb = std::move(st->cb); - cb(-EINVAL); - return; - } - // Generate inode ID - allocate_new_id(st->self, [st](int res, uint64_t new_id) - { - if (res < 0) - { - auto cb = std::move(st->cb); - cb(res); - return; - } - st->new_id = new_id; - auto direntry = json11::Json::object{ { "ino", st->new_id } }; - if (st->attrobj["type"].string_value() == "dir") - { - direntry["type"] = "dir"; - } - st->attrs = std::move(st->attrobj); - st->direntry_text = json11::Json(direntry).dump().c_str(); - // Set direntry - st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), st->direntry_text, [st](int res) - { - if (res < 0) - { - st->self->parent->unallocated_ids.push_back(st->new_id); - if (res == -EAGAIN) - { - if (st->dup_ino) - { - st->new_id = st->dup_ino; - res = 0; - } - else - res = -EEXIST; - } - else - fprintf(stderr, "create %ju/%s failed: %s (code %d)\n", st->dir_ino, st->filename.c_str(), strerror(-res), res); - auto cb = std::move(st->cb); - cb(res); - } - else - { - st->self->parent->db->set(kv_inode_key(st->new_id), st->attrs.dump().c_str(), [st](int res) - { - if (res == -EAGAIN) - { - res = -EEXIST; - } - if (res < 0) - { - st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st, res](int del_res) - { - if (!del_res) - { - st->self->parent->unallocated_ids.push_back(st->new_id); - } - auto cb = std::move(st->cb); - cb(res); - }, [st](int res, const std::string & value) - { - return res != -ENOENT && value == st->direntry_text; - }); - } - else - { - auto cb = std::move(st->cb); - cb(0); - } - }, [st](int res, const std::string & value) - { - return res == -ENOENT; - }); - } - }, [st](int res, const std::string & value) - { - // CAS compare - check that the key doesn't exist - if (res == 0) - { - std::string err; - auto direntry = json11::Json::parse(value, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, overwriting\n", - kv_direntry_key(st->dir_ino, st->filename).c_str(), value.c_str(), err.c_str()); - return true; - } - if (st->exclusive && direntry["verf"].uint64_value() == st->verf) - { - st->dup_ino = direntry["ino"].uint64_value(); - return false; - } - return false; - } - return true; - }); - }); -} - -static void kv_create_setattr(json11::Json::object & attrobj, sattr3 & sattr) -{ - if (sattr.mode.set_it) - attrobj["mode"] = (uint64_t)sattr.mode.mode; - if (sattr.uid.set_it) - attrobj["uid"] = (uint64_t)sattr.uid.uid; - if (sattr.gid.set_it) - attrobj["gid"] = (uint64_t)sattr.gid.gid; - if (sattr.atime.set_it) - attrobj["atime"] = nfstime_to_str(sattr.atime.atime); - if (sattr.mtime.set_it) - attrobj["mtime"] = nfstime_to_str(sattr.mtime.mtime); -} - -template static void kv_create_reply(kv_create_state *st, int res) -{ - T *reply = (T*)st->rop->reply; - if (res < 0) - { - *reply = (T){ .status = vitastor_nfs_map_err(-res) }; - } - else - { - *reply = (T){ - .status = NFS3_OK, - .resok = (Tok){ - .obj = { - .handle_follows = 1, - .handle = xdr_copy_string(st->rop->xdrs, kv_fh(st->new_id)), - }, - .obj_attributes = { - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, st->new_id, st->attrs), - }, - }, - }; - } - rpc_queue_reply(st->rop); - delete st; -} - -static int nfs3_create_proc(void *opaque, rpc_op_t *rop) -{ - kv_create_state *st = new kv_create_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - auto args = (CREATE3args*)rop->request; - st->exclusive = args->how.mode == NFS_EXCLUSIVE; - st->verf = st->exclusive ? *(uint64_t*)&args->how.verf : 0; - st->dir_ino = kv_fh_inode(args->where.dir); - st->filename = args->where.name; - if (args->how.mode == NFS_EXCLUSIVE) - { - st->attrobj["verf"] = *(uint64_t*)&args->how.verf; - } - else if (args->how.mode == NFS_UNCHECKED) - { - kv_create_setattr(st->attrobj, args->how.obj_attributes); - if (args->how.obj_attributes.size.set_it) - { - st->attrobj["size"] = (uint64_t)args->how.obj_attributes.size.size; - st->attrobj["empty"] = true; - } - } - st->cb = [st](int res) { kv_create_reply(st, res); }; - kv_do_create(st); - return 1; -} - -static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop) -{ - kv_create_state *st = new kv_create_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - auto args = (MKDIR3args*)rop->request; - st->dir_ino = kv_fh_inode(args->where.dir); - st->filename = args->where.name; - st->attrobj["type"] = "dir"; - st->attrobj["parent_ino"] = st->dir_ino; - kv_create_setattr(st->attrobj, args->attributes); - st->cb = [st](int res) { kv_create_reply(st, res); }; - kv_do_create(st); - return 1; -} - -static int nfs3_symlink_proc(void *opaque, rpc_op_t *rop) -{ - kv_create_state *st = new kv_create_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - auto args = (SYMLINK3args*)rop->request; - st->dir_ino = kv_fh_inode(args->where.dir); - st->filename = args->where.name; - st->attrobj["type"] = "link"; - st->attrobj["symlink"] = (std::string)args->symlink.symlink_data; - kv_create_setattr(st->attrobj, args->symlink.symlink_attributes); - st->cb = [st](int res) { kv_create_reply(st, res); }; - kv_do_create(st); - return 1; -} - -static int nfs3_mknod_proc(void *opaque, rpc_op_t *rop) -{ - kv_create_state *st = new kv_create_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - auto args = (MKNOD3args*)rop->request; - st->dir_ino = kv_fh_inode(args->where.dir); - st->filename = args->where.name; - if (args->what.type == NF3CHR || args->what.type == NF3BLK) - { - st->attrobj["type"] = (args->what.type == NF3CHR ? "chr" : "blk"); - st->attrobj["major"] = (uint64_t)args->what.chr_device.spec.specdata1; - st->attrobj["minor"] = (uint64_t)args->what.chr_device.spec.specdata2; - kv_create_setattr(st->attrobj, args->what.chr_device.dev_attributes); - } - else if (args->what.type == NF3SOCK || args->what.type == NF3FIFO) - { - st->attrobj["type"] = (args->what.type == NF3SOCK ? "sock" : "fifo"); - kv_create_setattr(st->attrobj, args->what.sock_attributes); - } - else - { - *(MKNOD3res*)rop->reply = (MKNOD3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - st->cb = [st](int res) { kv_create_reply(st, res); }; - kv_do_create(st); - return 1; -} - -struct kv_del_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - uint64_t dir_ino = 0; - std::string filename; - uint64_t ino = 0; - void *list_handle = NULL; - std::string prefix, list_key, direntry_text, ientry_text; - json11::Json direntry, ientry; - int type = 0; - bool is_rmdir = false; - bool rm_data = false; - int res = 0, res2 = 0; - std::function cb; -}; - -static void nfs_kv_continue_delete(kv_del_state *st, int state) -{ - // Overall algorithm: - // 1) Get inode attributes and check that it's not a directory (REMOVE) - // 2) Get inode attributes and check that it is a directory (RMDIR) - // 3) Delete direntry with CAS - // 4) Check that the directory didn't contain files (RMDIR) and restore it if it did - // 5) Reduce inode refcount by 1 or delete inode - // 6) If regular file and inode is deleted: delete data - if (state == 0) {} - else if (state == 1) goto resume_1; - else if (state == 2) goto resume_2; - else if (state == 3) goto resume_3; - else if (state == 4) goto resume_4; - else if (state == 5) goto resume_5; - else if (state == 6) goto resume_6; - else if (state == 7) goto resume_7; - else - { - fprintf(stderr, "BUG: invalid state in nfs_kv_continue_delete()"); - abort(); - } - st->self->parent->db->get(kv_direntry_key(st->dir_ino, st->filename), [st](int res, const std::string & value) - { - st->res = res; - st->direntry_text = value; - nfs_kv_continue_delete(st, 1); - }); - return; -resume_1: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - { - std::string err; - st->direntry = json11::Json::parse(st->direntry_text, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, deleting\n", - kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), err.c_str()); - // Just delete direntry and skip inode - } - else - { - st->ino = st->direntry["ino"].uint64_value(); - } - } - // Get inode - st->self->parent->db->get(kv_inode_key(st->ino), [st](int res, const std::string & value) - { - st->res = res; - st->ientry_text = value; - nfs_kv_continue_delete(st, 2); - }); - return; -resume_2: - if (st->res < 0) - { - fprintf(stderr, "error reading inode %s: %s (code %d)\n", - kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res); - auto cb = std::move(st->cb); - cb(st->res); - return; - } - { - std::string err; - st->ientry = json11::Json::parse(st->ientry_text, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in inode %s = %s: %s, treating as a regular file\n", - kv_inode_key(st->ino).c_str(), st->ientry_text.c_str(), err.c_str()); - } - } - // (1-2) Check type - st->type = kv_map_type(st->ientry["type"].string_value()); - if (st->type == -1 || st->is_rmdir != (st->type == NF3DIR)) - { - auto cb = std::move(st->cb); - cb(st->is_rmdir ? -ENOTDIR : -EISDIR); - return; - } - // (3) Delete direntry with CAS - st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) - { - st->res = res; - nfs_kv_continue_delete(st, 3); - }, [st](int res, const std::string & value) - { - return value == st->direntry_text; - }); - return; -resume_3: - if (st->res == -EAGAIN) - { - // CAS failure, restart from the beginning - nfs_kv_continue_delete(st, 0); - return; - } - else if (st->res < 0 && st->res != -ENOENT) - { - fprintf(stderr, "failed to remove direntry %s: %s (code %d)\n", - kv_direntry_key(st->dir_ino, st->filename).c_str(), strerror(-st->res), st->res); - auto cb = std::move(st->cb); - cb(st->res); - return; - } - if (!st->ino) - { - // direntry contained invalid JSON and was deleted, finish - auto cb = std::move(st->cb); - cb(0); - return; - } - if (st->is_rmdir) - { - // (4) Check if directory actually is not empty - st->list_handle = st->self->parent->db->list_start(kv_direntry_key(st->ino, "")); - st->self->parent->db->list_next(st->list_handle, [st](int res, const std::string & key, const std::string & value) - { - st->res = res; - st->list_key = key; - st->self->parent->db->list_close(st->list_handle); - nfs_kv_continue_delete(st, 4); - }); - return; -resume_4: - st->prefix = kv_direntry_key(st->ino, ""); - if (st->res == -ENOENT || st->list_key.size() < st->prefix.size() || st->list_key.substr(0, st->prefix.size()) != st->prefix) - { - // OK, directory is empty - } - else - { - // Not OK, restore direntry - st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) - { - st->res2 = res; - nfs_kv_continue_delete(st, 5); - }, [st](int res, const std::string & value) - { - return res == -ENOENT; - }); - return; -resume_5: - if (st->res2 < 0) - { - fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)", - kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2); - fprintf(stderr, " - inode %ju may be left as garbage\n", st->ino); - } - if (st->res < 0) - { - fprintf(stderr, "failed to list entries from %s: %s (code %d)\n", - kv_direntry_key(st->ino, "").c_str(), strerror(-st->res), st->res); - } - auto cb = std::move(st->cb); - cb(st->res < 0 ? st->res : -ENOTEMPTY); - return; - } - } - // (5) Reduce inode refcount by 1 or delete inode - if (st->ientry["nlink"].uint64_value() > 1) - { - auto copy = st->ientry.object_items(); - copy["nlink"] = st->ientry["nlink"].uint64_value()-1; - st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(copy).dump(), [st](int res) - { - st->res = res; - nfs_kv_continue_delete(st, 6); - }, [st](int res, const std::string & old_value) - { - return old_value == st->ientry_text; - }); - } - else - { - st->self->parent->db->del(kv_inode_key(st->ino), [st](int res) - { - st->res = res; - nfs_kv_continue_delete(st, 6); - }, [st](int res, const std::string & old_value) - { - return old_value == st->ientry_text; - }); - } - return; -resume_6: - if (st->res < 0) - { - // Assume EAGAIN is OK, maybe someone created a hard link in the meantime - auto cb = std::move(st->cb); - cb(st->res == -EAGAIN ? 0 : st->res); - return; - } - // (6) If regular file and inode is deleted: delete data - if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1) - { - // Remove data - st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { - { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, - { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, - }), [st](const cli_result_t & r) - { - if (r.err) - { - fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n", - st->ino, r.text.c_str(), r.err); - } - st->res = r.err; - nfs_kv_continue_delete(st, 7); - }); - return; -resume_7: - auto cb = std::move(st->cb); - cb(st->res); - return; - } - auto cb = std::move(st->cb); - cb(0); -} - -static int nfs3_remove_proc(void *opaque, rpc_op_t *rop) -{ - kv_del_state *st = new kv_del_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - REMOVE3res *reply = (REMOVE3res*)rop->reply; - REMOVE3args *args = (REMOVE3args*)rop->request; - st->dir_ino = kv_fh_inode(args->object.dir); - st->filename = args->object.name; - if (st->self->parent->trace) - fprintf(stderr, "[%d] REMOVE %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str()); - if (!st->dir_ino) - { - *reply = (REMOVE3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - st->cb = [st](int res) - { - *((REMOVE3res*)st->rop->reply) = (REMOVE3res){ - .status = vitastor_nfs_map_err(res), - }; - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_delete(st, 0); - return 1; -} - -static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop) -{ - kv_del_state *st = new kv_del_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - RMDIR3args *args = (RMDIR3args*)rop->request; - RMDIR3res *reply = (RMDIR3res*)rop->reply; - st->dir_ino = kv_fh_inode(args->object.dir); - st->filename = args->object.name; - st->is_rmdir = true; - if (st->self->parent->trace) - fprintf(stderr, "[%d] RMDIR %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str()); - if (!st->dir_ino) - { - *reply = (RMDIR3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - st->cb = [st](int res) - { - *((RMDIR3res*)st->rop->reply) = (RMDIR3res){ - .status = vitastor_nfs_map_err(res), - }; - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_delete(st, 0); - return 1; -} - -struct nfs_kv_rename_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - uint64_t old_dir_ino = 0, new_dir_ino = 0; - std::string old_name, new_name; - std::string old_direntry_text; - std::string ientry_text; - json11::Json direntry, ientry; - int res = 0, res2 = 0; - std::function cb; -}; - -static void nfs_kv_continue_rename(nfs_kv_rename_state *st, int state) -{ - // Simplified algorithm (non-atomic and without ENOTDIR/EISDIR): - // 1) Check if the target directory exists - // 2) Delete & save (using CAS) the source direntry - // 3) Write the target direntry using CAS, fail if it already exists - // 4) Restore the source direntry on failure - // Atomic version would require something like a journal - if (state == 0) {} - else if (state == 1) goto resume_1; - else if (state == 2) goto resume_2; - else if (state == 3) goto resume_3; - else if (state == 4) goto resume_4; - else if (state == 5) goto resume_5; - else if (state == 6) goto resume_6; - else - { - fprintf(stderr, "BUG: invalid state in nfs_kv_continue_rename()"); - abort(); - } - kv_read_inode(st->self, st->new_dir_ino, [st](int res, const std::string & value, json11::Json attrs) - { - st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res; - nfs_kv_continue_rename(st, 1); - }); - return; -resume_1: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - // Read & delete the old direntry - st->self->parent->db->del(kv_direntry_key(st->old_dir_ino, st->old_name), [st](int res) - { - st->res = res; - nfs_kv_continue_rename(st, 2); - }, [=](int res, const std::string & old_value) - { - st->res = res; - st->old_direntry_text = old_value; - return true; - }); - return; -resume_2: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - { - std::string err; - st->direntry = json11::Json::parse(st->old_direntry_text, err); - if (err != "") - { - fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", - kv_direntry_key(st->old_dir_ino, st->old_name).c_str(), - st->old_direntry_text.c_str(), err.c_str()); - } - } - if (st->direntry["type"].string_value() == "dir" && - st->direntry["ino"].uint64_value() != 0 && - st->new_dir_ino != st->old_dir_ino) - { - // Read & check inode - kv_read_inode(st->self, st->direntry["ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry) - { - st->res = res; - st->ientry_text = value; - st->ientry = ientry; - nfs_kv_continue_rename(st, 3); - }); - return; -resume_3: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - // Change parent reference - { - auto ientry_new = st->ientry.object_items(); - ientry_new["parent_ino"] = st->new_dir_ino; - st->self->parent->db->set(kv_inode_key(st->direntry["ino"].uint64_value()), json11::Json(ientry_new).dump(), [st](int res) - { - st->res = res; - nfs_kv_continue_rename(st, 4); - }, [st](int res, const std::string & old_value) - { - return old_value == st->ientry_text; - }); - } - return; -resume_4: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - } - st->self->parent->db->set(kv_direntry_key(st->new_dir_ino, st->new_name), st->old_direntry_text, [st](int res) - { - st->res = res; - nfs_kv_continue_rename(st, 5); - }, [st](int res, const std::string & old_value) - { - return res == -ENOENT; - }); - return; -resume_5: - if (st->res < 0) - { - if (st->res == -EAGAIN) - st->res = -EEXIST; - st->res2 = st->res; - st->self->parent->db->set(kv_direntry_key(st->old_dir_ino, st->old_name), st->old_direntry_text, [st](int res) - { - st->res = res; - nfs_kv_continue_rename(st, 6); - }, [st](int res, const std::string & old_value) - { - return res == -ENOENT; - }); - return; -resume_6: - if (st->res < 0) - { - if (st->res == -EAGAIN) - st->res = -EEXIST; - fprintf(stderr, "error restoring %s = %s after failed rename: %s (code %d)\n", - kv_direntry_key(st->old_dir_ino, st->old_name).c_str(), st->old_direntry_text.c_str(), - strerror(-st->res), st->res); - } - auto cb = std::move(st->cb); - cb(st->res2); - return; - } - auto cb = std::move(st->cb); - cb(st->res); -} - -static int nfs3_rename_proc(void *opaque, rpc_op_t *rop) -{ - auto st = new nfs_kv_rename_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - RENAME3args *args = (RENAME3args*)rop->request; - st->old_dir_ino = kv_fh_inode(args->from.dir); - st->new_dir_ino = kv_fh_inode(args->to.dir); - st->old_name = args->from.name; - st->new_name = args->to.name; - if (st->self->parent->trace) - fprintf(stderr, "[%d] RENAME %ju/%s -> %ju/%s\n", st->self->nfs_fd, st->old_dir_ino, st->old_name.c_str(), st->new_dir_ino, st->new_name.c_str()); - if (!st->old_dir_ino || !st->new_dir_ino || st->old_name == "" || st->new_name == "") - { - RENAME3res *reply = (RENAME3res*)rop->reply; - *reply = (RENAME3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - if (st->old_dir_ino == st->new_dir_ino && st->old_name == st->new_name) - { - RENAME3res *reply = (RENAME3res*)rop->reply; - *reply = (RENAME3res){ .status = NFS3_OK }; - rpc_queue_reply(st->rop); - delete st; - return 0; - } - st->cb = [st](int res) - { - RENAME3res *reply = (RENAME3res*)st->rop->reply; - *reply = (RENAME3res){ .status = vitastor_nfs_map_err(res) }; - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_rename(st, 0); - return 1; -} - -struct nfs_kv_link_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - uint64_t ino = 0; - uint64_t dir_ino = 0; - std::string filename; - std::string ientry_text; - json11::Json ientry; - bool retrying = false; - int wait = 0; - int res = 0, res2 = 0; - std::function cb; -}; - -static void nfs_kv_continue_link(nfs_kv_link_state *st, int state) -{ - // 1) Read the source inode - // 2) If it's a directory - fail with -EISDIR - // 3) Create the new direntry with the same inode reference - // 4) Update the inode entry with refcount++ - // 5) Retry update if CAS failed but the inode exists - // 6) Otherwise fail and remove the new direntry - // Yeah we may leave a bad direntry if we crash - // But the other option is to possibly leave an inode with too big refcount - if (state == 0) {} - else if (state == 1) goto resume_1; - else if (state == 2) goto resume_2; - else if (state == 3) goto resume_3; - else if (state == 4) goto resume_4; - else - { - fprintf(stderr, "BUG: invalid state in nfs_kv_continue_link()"); - abort(); - } -resume_0: - // Check that the source inode exists and is not a directory - st->wait = st->retrying ? 1 : 2; - st->res2 = 0; - kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs) - { - st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? -EISDIR : 0) : res; - st->ientry_text = value; - st->ientry = attrs; - if (!--st->wait) - nfs_kv_continue_link(st, 1); - }); - if (!st->retrying) - { - // Check that the new directory exists - kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json attrs) - { - st->res2 = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res; - if (!--st->wait) - nfs_kv_continue_link(st, 1); - }); - } - return; -resume_1: - if (st->res < 0 || st->res2 < 0) - { - auto cb = std::move(st->cb); - cb(st->res < 0 ? st->res : st->res2); - return; - } - // Write the new direntry - if (!st->retrying) - { - st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), - json11::Json(json11::Json::object{ { "ino", st->ino } }).dump(), [st](int res) - { - st->res = res; - nfs_kv_continue_link(st, 2); - }, [st](int res, const std::string & old_value) - { - return res == -ENOENT; - }); - return; -resume_2: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - } - // Increase inode refcount - { - auto new_ientry = st->ientry.object_items(); - auto nlink = new_ientry["nlink"].uint64_value(); - new_ientry["nlink"] = nlink ? nlink+1 : 2; - st->ientry = new_ientry; - } - st->self->parent->db->set(kv_inode_key(st->ino), st->ientry.dump(), [st](int res) - { - st->res = res; - nfs_kv_continue_link(st, 3); - }, [st](int res, const std::string & old_value) - { - st->res2 = res; - return res == 0 && old_value == st->ientry_text; - }); - return; -resume_3: - if (st->res2 == -ENOENT) - { - st->res = -ENOENT; - } - if (st->res == -EAGAIN) - { - // Re-read inode and retry - st->retrying = true; - goto resume_0; - } - if (st->res < 0) - { - // Maybe inode was deleted in the meantime, delete our direntry - st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) - { - st->res2 = res; - nfs_kv_continue_link(st, 4); - }); - return; -resume_4: - if (st->res2 < 0) - { - fprintf(stderr, "Warning: failed to delete new linked direntry %ju/%s: %s (code %d)\n", - st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2); - } - } - auto cb = std::move(st->cb); - cb(st->res); -} - -// FIXME: We'll need some tests for the FS -static int nfs3_link_proc(void *opaque, rpc_op_t *rop) -{ - auto st = new nfs_kv_link_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - LINK3args *args = (LINK3args*)rop->request; - st->ino = kv_fh_inode(args->file); - st->dir_ino = kv_fh_inode(args->link.dir); - st->filename = args->link.name; - if (st->self->parent->trace) - fprintf(stderr, "[%d] LINK %ju -> %ju/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str()); - if (!st->ino || !st->dir_ino || st->filename == "") - { - LINK3res *reply = (LINK3res*)rop->reply; - *reply = (LINK3res){ .status = NFS3ERR_INVAL }; - rpc_queue_reply(rop); - delete st; - return 0; - } - st->cb = [st](int res) - { - LINK3res *reply = (LINK3res*)st->rop->reply; - if (res < 0) - { - *reply = (LINK3res){ .status = vitastor_nfs_map_err(res) }; - } - else - { - *reply = (LINK3res){ - .status = NFS3_OK, - .resok = (LINK3resok){ - .file_attributes = (post_op_attr){ - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, st->ino, st->ientry), - }, - }, - }; - } - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_link(st, 0); - return 1; -} - -struct nfs_kv_readdir_state -{ - nfs_client_t *self = NULL; - rpc_op_t *rop = NULL; - // Request: - bool is_plus = false; - uint64_t cookie = 0; - uint64_t cookieverf = 0; - uint64_t dir_ino = 0; - uint64_t maxcount = 0; - std::function cb; - // State: - int res = 0; - std::string prefix, start; - void *list_handle; - uint64_t parent_ino = 0; - std::string ientry_text, parent_ientry_text; - json11::Json ientry, parent_ientry; - std::string cur_key, cur_value; - int reply_size = 0; - int to_skip = 0; - uint64_t offset = 0; - int getattr_running = 0, getattr_cur = 0; - // Result: - bool eof = false; - //uint64_t cookieverf = 0; // same field - std::vector entries; -}; - -static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state); - -static void kv_getattr_next(nfs_kv_readdir_state *st) -{ - while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel) - { - auto idx = st->getattr_cur++; - st->getattr_running++; - kv_read_inode(st->self, st->entries[idx].fileid, [st, idx](int res, const std::string & value, json11::Json ientry) - { - if (res == 0) - { - st->entries[idx].name_attributes = (post_op_attr){ - // FIXME: maybe do not read parent attributes and leave them to a GETATTR? - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, st->entries[idx].fileid, ientry), - }; - } - st->getattr_running--; - kv_getattr_next(st); - if (st->getattr_running == 0 && !st->list_handle) - { - nfs_kv_continue_readdir(st, 4); - } - }); - } -} - -static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state) -{ - if (state == 0) {} - else if (state == 1) goto resume_1; - else if (state == 2) goto resume_2; - else if (state == 3) goto resume_3; - else if (state == 4) goto resume_4; - else - { - fprintf(stderr, "BUG: invalid state in nfs_kv_continue_readdir()"); - abort(); - } - // Limit results based on maximum reply size - // Sadly we have to calculate reply size by hand - // reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes - st->reply_size = 20; - if (st->reply_size > st->maxcount) - { - // Error, too small max reply size - auto cb = std::move(st->cb); - cb(-NFS3ERR_TOOSMALL); - return; - } - // Add . and .. - if (st->cookie <= 1) - { - kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json ientry) - { - st->res = res; - st->ientry_text = value; - st->ientry = ientry; - nfs_kv_continue_readdir(st, 1); - }); - return; -resume_1: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - if (st->cookie == 0) - { - auto fh = kv_fh(st->dir_ino); - auto entry_size = 20 + 4/*len_pad4(".")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); - if (st->reply_size + entry_size > st->maxcount) - { - auto cb = std::move(st->cb); - cb(-NFS3ERR_TOOSMALL); - return; - } - entryplus3 dot = {}; - dot.name = xdr_copy_string(st->rop->xdrs, "."); - dot.fileid = st->dir_ino; - dot.name_attributes = (post_op_attr){ - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, st->dir_ino, st->ientry), - }; - dot.name_handle = (post_op_fh3){ - .handle_follows = 1, - .handle = xdr_copy_string(st->rop->xdrs, fh), - }; - st->entries.push_back(dot); - st->reply_size += entry_size; - } - st->parent_ino = st->ientry["parent_ino"].uint64_value(); - if (st->parent_ino) - { - kv_read_inode(st->self, st->ientry["parent_ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry) - { - st->res = res; - st->parent_ientry_text = value; - st->parent_ientry = ientry; - nfs_kv_continue_readdir(st, 2); - }); - return; -resume_2: - if (st->res < 0) - { - auto cb = std::move(st->cb); - cb(st->res); - return; - } - } - auto fh = kv_fh(st->parent_ino); - auto entry_size = 20 + 4/*len_pad4("..")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); - if (st->reply_size + entry_size > st->maxcount) - { - st->eof = false; - auto cb = std::move(st->cb); - cb(0); - return; - } - entryplus3 dotdot = {}; - dotdot.name = xdr_copy_string(st->rop->xdrs, ".."); - dotdot.fileid = st->dir_ino; - dotdot.name_attributes = (post_op_attr){ - // FIXME: maybe do not read parent attributes and leave them to a GETATTR? - .attributes_follow = 1, - .attributes = get_kv_attributes(st->self, - st->parent_ino ? st->parent_ino : st->dir_ino, - st->parent_ino ? st->parent_ientry : st->ientry), - }; - dotdot.name_handle = (post_op_fh3){ - .handle_follows = 1, - .handle = xdr_copy_string(st->rop->xdrs, fh), - }; - st->entries.push_back(dotdot); - st->reply_size += entry_size; - } - st->prefix = kv_direntry_key(st->dir_ino, ""); - st->eof = true; - st->start = st->prefix; - if (st->cookie > 1) - { - auto lc_it = st->self->parent->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie }); - if (lc_it != st->self->parent->list_cookies.end()) - { - st->start = lc_it->second.key; - st->to_skip = 1; - st->offset = st->cookie; - } - else - { - st->to_skip = st->cookie-2; - st->offset = 2; - st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62)); - } - } - else - { - st->to_skip = 0; - st->offset = 2; - st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62)); - } - { - auto lc_it = st->self->parent->list_cookies.lower_bound((list_cookie_t){ st->dir_ino, st->cookieverf, 0 }); - if (lc_it != st->self->parent->list_cookies.end() && - lc_it->first.dir_ino == st->dir_ino && - lc_it->first.cookieverf == st->cookieverf && - lc_it->first.cookie < st->cookie) - { - auto lc_start = lc_it; - while (lc_it != st->self->parent->list_cookies.end() && lc_it->first.cookieverf == st->cookieverf) - { - lc_it++; - } - st->self->parent->list_cookies.erase(lc_start, lc_it); - } - } - st->getattr_cur = st->entries.size(); - st->list_handle = st->self->parent->db->list_start(st->start); - st->self->parent->db->list_next(st->list_handle, [=](int res, const std::string & key, const std::string & value) - { - st->res = res; - st->cur_key = key; - st->cur_value = value; - nfs_kv_continue_readdir(st, 3); - }); - return; - while (st->list_handle) - { - st->self->parent->db->list_next(st->list_handle, NULL); - return; -resume_3: - if (st->res == -ENOENT || st->cur_key.size() > st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix) - { - st->self->parent->db->list_close(st->list_handle); - st->list_handle = NULL; - break; - } - if (st->to_skip > 0) - { - st->to_skip--; - continue; - } - std::string err; - auto direntry = json11::Json::parse(st->cur_value, err); - if (err != "") - { - fprintf(stderr, "readdir: direntry %s contains invalid JSON: %s, skipping\n", - st->cur_key.c_str(), st->cur_value.c_str()); - continue; - } - auto ino = direntry["ino"].uint64_value(); - auto name = kv_direntry_filename(st->cur_key); - auto fh = kv_fh(ino); - // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes - // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8 - // + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0) - // + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes - auto entry_size = 20 + len_pad4(name.size()) + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); - if (st->reply_size + entry_size > st->maxcount) - { - st->eof = false; - st->self->parent->db->list_close(st->list_handle); - st->list_handle = NULL; - break; - } - st->reply_size += entry_size; - auto idx = st->entries.size(); - st->entries.push_back((entryplus3){}); - auto entry = &st->entries[idx]; - entry->name = xdr_copy_string(st->rop->xdrs, name); - entry->fileid = ino; - entry->cookie = st->offset++; - st->self->parent->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name }; - if (st->is_plus) - { - entry->name_handle = (post_op_fh3){ - .handle_follows = 1, - .handle = xdr_copy_string(st->rop->xdrs, fh), - }; - kv_getattr_next(st); - } - st->self->parent->db->list_next(st->list_handle, NULL); - } -resume_4: - while (st->getattr_running > 0) - { - return; - } - void *prev = NULL; - for (int i = 0; i < st->entries.size(); i++) - { - entryplus3 *entry = &st->entries[i]; - if (prev) - { - if (st->is_plus) - ((entryplus3*)prev)->nextentry = entry; - else - ((entry3*)prev)->nextentry = (entry3*)entry; - } - prev = entry; - } - // Send reply - auto cb = std::move(st->cb); - cb(0); -} - -static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus) -{ - auto st = new nfs_kv_readdir_state; - st->self = (nfs_client_t*)opaque; - st->rop = rop; - st->is_plus = is_plus; - if (st->is_plus) - { - READDIRPLUS3args *args = (READDIRPLUS3args*)rop->request; - st->dir_ino = kv_fh_inode(args->dir); - st->cookie = args->cookie; - st->cookieverf = *((uint64_t*)args->cookieverf); - st->maxcount = args->maxcount; - } - else - { - READDIR3args *args = ((READDIR3args*)rop->request); - st->dir_ino = kv_fh_inode(args->dir); - st->cookie = args->cookie; - st->cookieverf = *((uint64_t*)args->cookieverf); - st->maxcount = args->count; - } - if (st->self->parent->trace) - fprintf(stderr, "[%d] READDIR %ju VERF %jx OFFSET %ju LIMIT %ju\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount); - st->cb = [st](int res) - { - if (st->is_plus) - { - READDIRPLUS3res *reply = (READDIRPLUS3res*)st->rop->reply; - *reply = (READDIRPLUS3res){ .status = vitastor_nfs_map_err(res) }; - *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf; - reply->resok.reply.entries = st->entries.size() ? &st->entries[0] : NULL; - reply->resok.reply.eof = st->eof; - } - else - { - READDIR3res *reply = (READDIR3res*)st->rop->reply; - *reply = (READDIR3res){ .status = vitastor_nfs_map_err(res) }; - *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf; - reply->resok.reply.entries = st->entries.size() ? (entry3*)&st->entries[0] : NULL; - reply->resok.reply.eof = st->eof; - } - rpc_queue_reply(st->rop); - delete st; - }; - nfs_kv_continue_readdir(st, 0); -} - -static int nfs3_readdir_proc(void *opaque, rpc_op_t *rop) -{ - nfs3_readdir_common(opaque, rop, false); - return 0; -} - -static int nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop) -{ - nfs3_readdir_common(opaque, rop, true); - return 0; -} - -// Get file system statistics -static int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - //FSSTAT3args *args = (FSSTAT3args*)rop->request; - if (self->parent->trace) - fprintf(stderr, "[%d] FSSTAT\n", self->nfs_fd); - FSSTAT3res *reply = (FSSTAT3res*)rop->reply; - uint64_t tbytes = 0, fbytes = 0; - auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id); - if (pst_it != self->parent->pool_stats.end()) - { - auto ttb = pst_it->second["total_raw_tb"].number_value(); - auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value()); - tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40); - fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40); - } - *reply = (FSSTAT3res){ - .status = NFS3_OK, - .resok = (FSSTAT3resok){ - .obj_attributes = { - .attributes_follow = 0, - //.attributes = get_root_attributes(self), - }, - .tbytes = tbytes, // total bytes - .fbytes = fbytes, // free bytes - .abytes = fbytes, // available bytes - .tfiles = (size3)(1 << 31), // maximum total files - .ffiles = (size3)(1 << 31), // free files - .afiles = (size3)(1 << 31), // available files - .invarsec = 0, - }, - }; - rpc_queue_reply(rop); - return 0; -} - -static int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - FSINFO3args *args = (FSINFO3args*)rop->request; - FSINFO3res *reply = (FSINFO3res*)rop->reply; - if (self->parent->trace) - fprintf(stderr, "[%d] FSINFO %s\n", self->nfs_fd, std::string(args->fsroot).c_str()); - if (args->fsroot != kv_root_handle) - { - *reply = (FSINFO3res){ .status = NFS3ERR_INVAL }; - } - else - { - // Fill info - *reply = (FSINFO3res){ - .status = NFS3_OK, - .resok = (FSINFO3resok){ - .obj_attributes = { - .attributes_follow = 0, - //.attributes = get_root_attributes(self), - }, - .rtmax = 128*1024*1024, - .rtpref = 128*1024*1024, - .rtmult = 4096, - .wtmax = 128*1024*1024, - .wtpref = 128*1024*1024, - .wtmult = 4096, - .dtpref = 128, - .maxfilesize = 0x7fffffffffffffff, - .time_delta = { - .seconds = 1, - .nseconds = 0, - }, - .properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS, - }, - }; - } - rpc_queue_reply(rop); - return 0; -} - -static int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - PATHCONF3args *args = (PATHCONF3args*)rop->request; - PATHCONF3res *reply = (PATHCONF3res*)rop->reply; - if (self->parent->trace) - fprintf(stderr, "[%d] PATHCONF %s\n", self->nfs_fd, std::string(args->object).c_str()); - if (args->object != kv_root_handle) - { - *reply = (PATHCONF3res){ .status = NFS3ERR_INVAL }; - } - else - { - // Fill info - *reply = (PATHCONF3res){ - .status = NFS3_OK, - .resok = (PATHCONF3resok){ - .obj_attributes = { - // Without at least one reference to a non-constant value (local variable or something else), - // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here - // FIXME: get rid of this after raising compiler requirement - .attributes_follow = 0, - //.attributes = get_root_attributes(self), - }, - .linkmax = 0, - .name_max = 255, - .no_trunc = TRUE, - .chown_restricted = FALSE, - .case_insensitive = FALSE, - .case_preserving = TRUE, - }, - }; - } - rpc_queue_reply(rop); - return 0; -} - -static int nfs3_commit_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - //COMMIT3args *args = (COMMIT3args*)rop->request; - cluster_op_t *op = new cluster_op_t; - // fsync. we don't know how to fsync a single inode, so just fsync everything - op->opcode = OSD_OP_SYNC; - op->callback = [self, rop](cluster_op_t *op) - { - COMMIT3res *reply = (COMMIT3res*)rop->reply; - *reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) }; - *(uint64_t*)reply->resok.verf = self->parent->server_id; - rpc_queue_reply(rop); - }; - self->parent->cli->execute(op); - return 1; -} - -static int mount3_mnt_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - //nfs_dirpath *args = (nfs_dirpath*)rop->request; - if (self->parent->trace) - fprintf(stderr, "[%d] MNT\n", self->nfs_fd); - nfs_mountres3 *reply = (nfs_mountres3*)rop->reply; - u_int flavor = RPC_AUTH_NONE; - reply->fhs_status = MNT3_OK; - reply->mountinfo.fhandle = xdr_copy_string(rop->xdrs, kv_root_handle); - reply->mountinfo.auth_flavors.auth_flavors_len = 1; - reply->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&flavor, sizeof(u_int)).data; - rpc_queue_reply(rop); - return 0; -} - -static int mount3_dump_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - if (self->parent->trace) - fprintf(stderr, "[%d] DUMP\n", self->nfs_fd); - nfs_mountlist *reply = (nfs_mountlist*)rop->reply; - *reply = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody)); - xdr_add_malloc(rop->xdrs, *reply); - (*reply)->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1"); - (*reply)->ml_directory = xdr_copy_string(rop->xdrs, self->parent->export_root); - (*reply)->ml_next = NULL; - rpc_queue_reply(rop); - return 0; -} - -static int mount3_umnt_proc(void *opaque, rpc_op_t *rop) -{ - //nfs_client_t *self = (nfs_client_t*)opaque; - //nfs_dirpath *arg = (nfs_dirpath*)rop->request; - // do nothing - rpc_queue_reply(rop); - return 0; -} - -static int mount3_umntall_proc(void *opaque, rpc_op_t *rop) -{ - // do nothing - rpc_queue_reply(rop); - return 0; -} - -static int mount3_export_proc(void *opaque, rpc_op_t *rop) -{ - nfs_client_t *self = (nfs_client_t*)opaque; - nfs_exports *reply = (nfs_exports*)rop->reply; - *reply = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode)); - xdr_add_malloc(rop->xdrs, *reply); - (*reply)->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root); - (*reply)->ex_groups = (struct nfs_groupnode*)(reply+1); - (*reply)->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1"); - (*reply)->ex_groups->gr_next = NULL; - (*reply)->ex_next = NULL; - rpc_queue_reply(rop); - return 0; -} - -nfs_client_t::nfs_client_t() -{ - struct rpc_service_proc_t pt[] = { - {NFS_PROGRAM, NFS_V3, NFS3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this}, - {NFS_PROGRAM, NFS_V3, NFS3_GETATTR, nfs3_getattr_proc, (xdrproc_t)xdr_GETATTR3args, sizeof(GETATTR3args), (xdrproc_t)xdr_GETATTR3res, sizeof(GETATTR3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_SETATTR, nfs3_setattr_proc, (xdrproc_t)xdr_SETATTR3args, sizeof(SETATTR3args), (xdrproc_t)xdr_SETATTR3res, sizeof(SETATTR3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP, nfs3_lookup_proc, (xdrproc_t)xdr_LOOKUP3args, sizeof(LOOKUP3args), (xdrproc_t)xdr_LOOKUP3res, sizeof(LOOKUP3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_ACCESS, nfs3_access_proc, (xdrproc_t)xdr_ACCESS3args, sizeof(ACCESS3args), (xdrproc_t)xdr_ACCESS3res, sizeof(ACCESS3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_READLINK, nfs3_readlink_proc, (xdrproc_t)xdr_READLINK3args, sizeof(READLINK3args), (xdrproc_t)xdr_READLINK3res, sizeof(READLINK3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_READ, nfs3_read_proc, (xdrproc_t)xdr_READ3args, sizeof(READ3args), (xdrproc_t)xdr_READ3res, sizeof(READ3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_WRITE, nfs3_write_proc, (xdrproc_t)xdr_WRITE3args, sizeof(WRITE3args), (xdrproc_t)xdr_WRITE3res, sizeof(WRITE3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_CREATE, nfs3_create_proc, (xdrproc_t)xdr_CREATE3args, sizeof(CREATE3args), (xdrproc_t)xdr_CREATE3res, sizeof(CREATE3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_MKDIR, nfs3_mkdir_proc, (xdrproc_t)xdr_MKDIR3args, sizeof(MKDIR3args), (xdrproc_t)xdr_MKDIR3res, sizeof(MKDIR3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK, nfs3_symlink_proc, (xdrproc_t)xdr_SYMLINK3args, sizeof(SYMLINK3args), (xdrproc_t)xdr_SYMLINK3res, sizeof(SYMLINK3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_MKNOD, nfs3_mknod_proc, (xdrproc_t)xdr_MKNOD3args, sizeof(MKNOD3args), (xdrproc_t)xdr_MKNOD3res, sizeof(MKNOD3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_REMOVE, nfs3_remove_proc, (xdrproc_t)xdr_REMOVE3args, sizeof(REMOVE3args), (xdrproc_t)xdr_REMOVE3res, sizeof(REMOVE3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_RMDIR, nfs3_rmdir_proc, (xdrproc_t)xdr_RMDIR3args, sizeof(RMDIR3args), (xdrproc_t)xdr_RMDIR3res, sizeof(RMDIR3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_RENAME, nfs3_rename_proc, (xdrproc_t)xdr_RENAME3args, sizeof(RENAME3args), (xdrproc_t)xdr_RENAME3res, sizeof(RENAME3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_LINK, nfs3_link_proc, (xdrproc_t)xdr_LINK3args, sizeof(LINK3args), (xdrproc_t)xdr_LINK3res, sizeof(LINK3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_READDIR, nfs3_readdir_proc, (xdrproc_t)xdr_READDIR3args, sizeof(READDIR3args), (xdrproc_t)xdr_READDIR3res, sizeof(READDIR3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT, nfs3_fsstat_proc, (xdrproc_t)xdr_FSSTAT3args, sizeof(FSSTAT3args), (xdrproc_t)xdr_FSSTAT3res, sizeof(FSSTAT3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_FSINFO, nfs3_fsinfo_proc, (xdrproc_t)xdr_FSINFO3args, sizeof(FSINFO3args), (xdrproc_t)xdr_FSINFO3res, sizeof(FSINFO3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF, nfs3_pathconf_proc, (xdrproc_t)xdr_PATHCONF3args, sizeof(PATHCONF3args), (xdrproc_t)xdr_PATHCONF3res, sizeof(PATHCONF3res), this}, - {NFS_PROGRAM, NFS_V3, NFS3_COMMIT, nfs3_commit_proc, (xdrproc_t)xdr_COMMIT3args, sizeof(COMMIT3args), (xdrproc_t)xdr_COMMIT3res, sizeof(COMMIT3res), this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT, mount3_mnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3), this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP, mount3_dump_proc, NULL, 0, (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist), this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT, mount3_umnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), NULL, 0, this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL, 0, NULL, 0, this}, - {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT, mount3_export_proc, NULL, 0, (xdrproc_t)xdr_nfs_exports, sizeof(nfs_exports), this}, - }; - for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++) - { - proc_table.insert(pt[i]); - } -} - -nfs_client_t::~nfs_client_t() -{ -} diff --git a/src/nfs_kv.cpp b/src/nfs_kv.cpp new file mode 100644 index 00000000..ae418677 --- /dev/null +++ b/src/nfs_kv.cpp @@ -0,0 +1,144 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - common functions + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +nfsstat3 vitastor_nfs_map_err(int err) +{ + return (err == EINVAL ? NFS3ERR_INVAL + : (err == ENOENT ? NFS3ERR_NOENT + : (err == ENOSPC ? NFS3ERR_NOSPC + : (err == EEXIST ? NFS3ERR_EXIST + : (err == EIO ? NFS3ERR_IO : (err ? NFS3ERR_IO : NFS3_OK)))))); +} + +nfstime3 nfstime_from_str(const std::string & s) +{ + nfstime3 t; + auto p = s.find("."); + if (p != std::string::npos) + { + t.seconds = stoull_full(s.substr(0, p), 10); + t.nseconds = stoull_full(s.substr(p+1), 10); + p = s.size()-p-1; + for (; p < 9; p++) + t.nseconds *= 10; + for (; p > 9; p--) + t.nseconds /= 10; + } + else + t.seconds = stoull_full(s, 10); + return t; +} + +std::string nfstime_to_str(nfstime3 t) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%u.%09u", t.seconds, t.nseconds); + return buf; +} + +int kv_map_type(const std::string & type) +{ + return (type == "" || type == "file" ? NF3REG : + (type == "dir" ? NF3DIR : + (type == "blk" ? NF3BLK : + (type == "chr" ? NF3CHR : + (type == "link" ? NF3LNK : + (type == "sock" ? NF3SOCK : + (type == "fifo" ? NF3FIFO : -1))))))); +} + +fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs) +{ + auto type = kv_map_type(attrs["type"].string_value()); + auto mode = attrs["mode"].uint64_value(); + auto nlink = attrs["nlink"].uint64_value(); + nfstime3 mtime = nfstime_from_str(attrs["mtime"].string_value()); + nfstime3 atime = attrs["atime"].is_null() ? mtime : nfstime_from_str(attrs["atime"].string_value()); + // FIXME In theory we could store the binary structure itself instead of JSON + return (fattr3){ + .type = (type == 0 ? NF3REG : (ftype3)type), + .mode = (attrs["mode"].is_null() ? (type == NF3DIR ? 0755 : 0644) : (uint32_t)mode), + .nlink = (nlink == 0 ? 1 : (uint32_t)nlink), + .uid = (uint32_t)attrs["uid"].uint64_value(), + .gid = (uint32_t)attrs["gid"].uint64_value(), + .size = (type == NF3DIR ? 4096 : attrs["size"].uint64_value()), + .used = (type == NF3DIR ? 4096 : attrs["alloc"].uint64_value()), + .rdev = (type == NF3BLK || type == NF3CHR + ? (specdata3){ (uint32_t)attrs["major"].uint64_value(), (uint32_t)attrs["minor"].uint64_value() } + : (specdata3){}), + .fsid = self->parent->fsid, + .fileid = ino, + .atime = atime, + .mtime = mtime, + .ctime = mtime, + }; +} + +std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename) +{ + // encode as: d / + char key[24] = { 0 }; + snprintf(key, sizeof(key), "d-%jx/", dir_ino); + int n = strnlen(key, sizeof(key)-1) - 3; + if (n < 10) + key[1] = '0'+n; + else + key[1] = 'A'+(n-10); + return (char*)key + filename; +} + +std::string kv_direntry_filename(const std::string & key) +{ + // decode as: d / + auto pos = key.find("/"); + if (pos != std::string::npos) + return key.substr(pos+1); + return key; +} + +std::string kv_inode_key(uint64_t ino) +{ + char key[24] = { 0 }; + snprintf(key, sizeof(key), "i-%jx", ino); + int n = strnlen(key, sizeof(key)-1) - 2; + if (n < 10) + key[1] = '0'+n; + else + key[1] = 'A'+(n-10); + return std::string(key, n+2); +} + +std::string kv_fh(uint64_t ino) +{ + return "S"+std::string((char*)&ino, 8); +} + +uint64_t kv_fh_inode(const std::string & fh) +{ + if (fh.size() == 1 && fh[0] == 'R') + { + return 1; + } + else if (fh.size() == 9 && fh[0] == 'S') + { + return *(uint64_t*)&fh[1]; + } + else if (fh.size() > 17 && fh[0] == 'I') + { + return *(uint64_t*)&fh[fh.size()-8]; + } + return 0; +} + +bool kv_fh_valid(const std::string & fh) +{ + return fh == KV_ROOT_HANDLE || fh.size() == 9 && fh[0] == 'S' || fh.size() > 17 && fh[0] == 'I'; +} diff --git a/src/nfs_kv_create.cpp b/src/nfs_kv_create.cpp new file mode 100644 index 00000000..605d39ad --- /dev/null +++ b/src/nfs_kv_create.cpp @@ -0,0 +1,328 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - CREATE, MKDIR, SYMLINK, MKNOD + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +void allocate_new_id(nfs_client_t *self, std::function cb) +{ + if (self->parent->fs_next_id <= self->parent->fs_allocated_id) + { + cb(0, self->parent->fs_next_id++); + return; + } + else if (self->parent->fs_next_id > self->parent->fs_inode_count) + { + cb(-ENOSPC, 0); + return; + } + self->parent->db->get(KV_NEXT_ID_KEY, [=](int res, const std::string & prev_str) + { + if (res < 0 && res != -ENOENT) + { + cb(res, 0); + return; + } + uint64_t prev_val = stoull_full(prev_str); + if (prev_val >= self->parent->fs_inode_count) + { + cb(-ENOSPC, 0); + return; + } + if (prev_val < 1) + { + prev_val = 1; + } + uint64_t new_val = prev_val + self->parent->id_alloc_batch_size; + if (new_val >= self->parent->fs_inode_count) + { + new_val = self->parent->fs_inode_count; + } + self->parent->db->set(KV_NEXT_ID_KEY, std::to_string(new_val), [=](int res) + { + if (res == -EAGAIN) + { + // CAS failure - retry + allocate_new_id(self, cb); + } + else if (res < 0) + { + cb(res, 0); + } + else + { + self->parent->fs_next_id = prev_val+2; + self->parent->fs_allocated_id = new_val; + cb(0, prev_val+1); + } + }, [prev_val](int res, const std::string & value) + { + // FIXME: Allow to modify value from CAS callback? ("update" query) + return res < 0 || stoull_full(value) == prev_val; + }); + }); +} + +struct kv_create_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + bool exclusive = false; + uint64_t verf = 0; + uint64_t dir_ino = 0; + std::string filename; + uint64_t new_id = 0; + json11::Json::object attrobj; + json11::Json attrs; + std::string direntry_text; + uint64_t dup_ino = 0; + std::function cb; +}; + +static void kv_do_create(kv_create_state *st) +{ + if (st->self->parent->trace) + fprintf(stderr, "[%d] CREATE %ju/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str()); + if (st->filename == "" || st->filename.find("/") != std::string::npos) + { + auto cb = std::move(st->cb); + cb(-EINVAL); + return; + } + // Generate inode ID + allocate_new_id(st->self, [st](int res, uint64_t new_id) + { + if (res < 0) + { + auto cb = std::move(st->cb); + cb(res); + return; + } + st->new_id = new_id; + auto direntry = json11::Json::object{ { "ino", st->new_id } }; + if (st->attrobj["type"].string_value() == "dir") + { + direntry["type"] = "dir"; + } + st->attrs = std::move(st->attrobj); + st->direntry_text = json11::Json(direntry).dump().c_str(); + // Set direntry + st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), st->direntry_text, [st](int res) + { + if (res < 0) + { + st->self->parent->unallocated_ids.push_back(st->new_id); + if (res == -EAGAIN) + { + if (st->dup_ino) + { + st->new_id = st->dup_ino; + res = 0; + } + else + res = -EEXIST; + } + else + fprintf(stderr, "create %ju/%s failed: %s (code %d)\n", st->dir_ino, st->filename.c_str(), strerror(-res), res); + auto cb = std::move(st->cb); + cb(res); + } + else + { + st->self->parent->db->set(kv_inode_key(st->new_id), st->attrs.dump().c_str(), [st](int res) + { + if (res == -EAGAIN) + { + res = -EEXIST; + } + if (res < 0) + { + st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st, res](int del_res) + { + if (!del_res) + { + st->self->parent->unallocated_ids.push_back(st->new_id); + } + auto cb = std::move(st->cb); + cb(res); + }, [st](int res, const std::string & value) + { + return res != -ENOENT && value == st->direntry_text; + }); + } + else + { + auto cb = std::move(st->cb); + cb(0); + } + }, [st](int res, const std::string & value) + { + return res == -ENOENT; + }); + } + }, [st](int res, const std::string & value) + { + // CAS compare - check that the key doesn't exist + if (res == 0) + { + std::string err; + auto direntry = json11::Json::parse(value, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, overwriting\n", + kv_direntry_key(st->dir_ino, st->filename).c_str(), value.c_str(), err.c_str()); + return true; + } + if (st->exclusive && direntry["verf"].uint64_value() == st->verf) + { + st->dup_ino = direntry["ino"].uint64_value(); + return false; + } + return false; + } + return true; + }); + }); +} + +static void kv_create_setattr(json11::Json::object & attrobj, sattr3 & sattr) +{ + if (sattr.mode.set_it) + attrobj["mode"] = (uint64_t)sattr.mode.mode; + if (sattr.uid.set_it) + attrobj["uid"] = (uint64_t)sattr.uid.uid; + if (sattr.gid.set_it) + attrobj["gid"] = (uint64_t)sattr.gid.gid; + if (sattr.atime.set_it) + attrobj["atime"] = nfstime_to_str(sattr.atime.atime); + if (sattr.mtime.set_it) + attrobj["mtime"] = nfstime_to_str(sattr.mtime.mtime); +} + +template static void kv_create_reply(kv_create_state *st, int res) +{ + T *reply = (T*)st->rop->reply; + if (res < 0) + { + *reply = (T){ .status = vitastor_nfs_map_err(-res) }; + } + else + { + *reply = (T){ + .status = NFS3_OK, + .resok = (Tok){ + .obj = { + .handle_follows = 1, + .handle = xdr_copy_string(st->rop->xdrs, kv_fh(st->new_id)), + }, + .obj_attributes = { + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, st->new_id, st->attrs), + }, + }, + }; + } + rpc_queue_reply(st->rop); + delete st; +} + +int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop) +{ + kv_create_state *st = new kv_create_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + auto args = (CREATE3args*)rop->request; + st->exclusive = args->how.mode == NFS_EXCLUSIVE; + st->verf = st->exclusive ? *(uint64_t*)&args->how.verf : 0; + st->dir_ino = kv_fh_inode(args->where.dir); + st->filename = args->where.name; + if (args->how.mode == NFS_EXCLUSIVE) + { + st->attrobj["verf"] = *(uint64_t*)&args->how.verf; + } + else if (args->how.mode == NFS_UNCHECKED) + { + kv_create_setattr(st->attrobj, args->how.obj_attributes); + if (args->how.obj_attributes.size.set_it) + { + st->attrobj["size"] = (uint64_t)args->how.obj_attributes.size.size; + st->attrobj["empty"] = true; + } + } + st->cb = [st](int res) { kv_create_reply(st, res); }; + kv_do_create(st); + return 1; +} + +int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop) +{ + kv_create_state *st = new kv_create_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + auto args = (MKDIR3args*)rop->request; + st->dir_ino = kv_fh_inode(args->where.dir); + st->filename = args->where.name; + st->attrobj["type"] = "dir"; + st->attrobj["parent_ino"] = st->dir_ino; + kv_create_setattr(st->attrobj, args->attributes); + st->cb = [st](int res) { kv_create_reply(st, res); }; + kv_do_create(st); + return 1; +} + +int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop) +{ + kv_create_state *st = new kv_create_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + auto args = (SYMLINK3args*)rop->request; + st->dir_ino = kv_fh_inode(args->where.dir); + st->filename = args->where.name; + st->attrobj["type"] = "link"; + st->attrobj["symlink"] = (std::string)args->symlink.symlink_data; + kv_create_setattr(st->attrobj, args->symlink.symlink_attributes); + st->cb = [st](int res) { kv_create_reply(st, res); }; + kv_do_create(st); + return 1; +} + +int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop) +{ + kv_create_state *st = new kv_create_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + auto args = (MKNOD3args*)rop->request; + st->dir_ino = kv_fh_inode(args->where.dir); + st->filename = args->where.name; + if (args->what.type == NF3CHR || args->what.type == NF3BLK) + { + st->attrobj["type"] = (args->what.type == NF3CHR ? "chr" : "blk"); + st->attrobj["major"] = (uint64_t)args->what.chr_device.spec.specdata1; + st->attrobj["minor"] = (uint64_t)args->what.chr_device.spec.specdata2; + kv_create_setattr(st->attrobj, args->what.chr_device.dev_attributes); + } + else if (args->what.type == NF3SOCK || args->what.type == NF3FIFO) + { + st->attrobj["type"] = (args->what.type == NF3SOCK ? "sock" : "fifo"); + kv_create_setattr(st->attrobj, args->what.sock_attributes); + } + else + { + *(MKNOD3res*)rop->reply = (MKNOD3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + st->cb = [st](int res) { kv_create_reply(st, res); }; + kv_do_create(st); + return 1; +} diff --git a/src/nfs_kv_fsstat.cpp b/src/nfs_kv_fsstat.cpp new file mode 100644 index 00000000..629955bf --- /dev/null +++ b/src/nfs_kv_fsstat.cpp @@ -0,0 +1,171 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - FSSTAT, FSINFO, PATHCONF + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +// Get file system statistics +int kv_nfs3_fsstat_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + //FSSTAT3args *args = (FSSTAT3args*)rop->request; + if (self->parent->trace) + fprintf(stderr, "[%d] FSSTAT\n", self->nfs_fd); + FSSTAT3res *reply = (FSSTAT3res*)rop->reply; + uint64_t tbytes = 0, fbytes = 0; + auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id); + if (pst_it != self->parent->pool_stats.end()) + { + auto ttb = pst_it->second["total_raw_tb"].number_value(); + auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value()); + tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40); + fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40); + } + *reply = (FSSTAT3res){ + .status = NFS3_OK, + .resok = (FSSTAT3resok){ + .obj_attributes = { + .attributes_follow = 0, + //.attributes = get_root_attributes(self), + }, + .tbytes = tbytes, // total bytes + .fbytes = fbytes, // free bytes + .abytes = fbytes, // available bytes + .tfiles = (size3)(1 << 31), // maximum total files + .ffiles = (size3)(1 << 31), // free files + .afiles = (size3)(1 << 31), // available files + .invarsec = 0, + }, + }; + rpc_queue_reply(rop); + return 0; +} + +int kv_nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + FSINFO3args *args = (FSINFO3args*)rop->request; + FSINFO3res *reply = (FSINFO3res*)rop->reply; + if (self->parent->trace) + fprintf(stderr, "[%d] FSINFO %s\n", self->nfs_fd, std::string(args->fsroot).c_str()); + if (args->fsroot != KV_ROOT_HANDLE) + { + *reply = (FSINFO3res){ .status = NFS3ERR_INVAL }; + } + else + { + // Fill info + *reply = (FSINFO3res){ + .status = NFS3_OK, + .resok = (FSINFO3resok){ + .obj_attributes = { + .attributes_follow = 0, + //.attributes = get_root_attributes(self), + }, + .rtmax = 128*1024*1024, + .rtpref = 128*1024*1024, + .rtmult = 4096, + .wtmax = 128*1024*1024, + .wtpref = 128*1024*1024, + .wtmult = 4096, + .dtpref = 128, + .maxfilesize = 0x7fffffffffffffff, + .time_delta = { + .seconds = 1, + .nseconds = 0, + }, + .properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS, + }, + }; + } + rpc_queue_reply(rop); + return 0; +} + +int kv_nfs3_pathconf_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + PATHCONF3args *args = (PATHCONF3args*)rop->request; + PATHCONF3res *reply = (PATHCONF3res*)rop->reply; + if (self->parent->trace) + fprintf(stderr, "[%d] PATHCONF %s\n", self->nfs_fd, std::string(args->object).c_str()); + if (args->object != KV_ROOT_HANDLE) + { + *reply = (PATHCONF3res){ .status = NFS3ERR_INVAL }; + } + else + { + // Fill info + *reply = (PATHCONF3res){ + .status = NFS3_OK, + .resok = (PATHCONF3resok){ + .obj_attributes = { + // Without at least one reference to a non-constant value (local variable or something else), + // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here + // FIXME: get rid of this after raising compiler requirement + .attributes_follow = 0, + //.attributes = get_root_attributes(self), + }, + .linkmax = 0, + .name_max = 255, + .no_trunc = TRUE, + .chown_restricted = FALSE, + .case_insensitive = FALSE, + .case_preserving = TRUE, + }, + }; + } + rpc_queue_reply(rop); + return 0; +} + +nfs_client_t::nfs_client_t() +{ + struct rpc_service_proc_t pt[] = { + {NFS_PROGRAM, NFS_V3, NFS3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this}, + {NFS_PROGRAM, NFS_V3, NFS3_GETATTR, kv_nfs3_getattr_proc, (xdrproc_t)xdr_GETATTR3args, sizeof(GETATTR3args), (xdrproc_t)xdr_GETATTR3res, sizeof(GETATTR3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_SETATTR, kv_nfs3_setattr_proc, (xdrproc_t)xdr_SETATTR3args, sizeof(SETATTR3args), (xdrproc_t)xdr_SETATTR3res, sizeof(SETATTR3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP, kv_nfs3_lookup_proc, (xdrproc_t)xdr_LOOKUP3args, sizeof(LOOKUP3args), (xdrproc_t)xdr_LOOKUP3res, sizeof(LOOKUP3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_ACCESS, nfs3_access_proc, (xdrproc_t)xdr_ACCESS3args, sizeof(ACCESS3args), (xdrproc_t)xdr_ACCESS3res, sizeof(ACCESS3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_READLINK, kv_nfs3_readlink_proc, (xdrproc_t)xdr_READLINK3args, sizeof(READLINK3args), (xdrproc_t)xdr_READLINK3res, sizeof(READLINK3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_READ, kv_nfs3_read_proc, (xdrproc_t)xdr_READ3args, sizeof(READ3args), (xdrproc_t)xdr_READ3res, sizeof(READ3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_WRITE, kv_nfs3_write_proc, (xdrproc_t)xdr_WRITE3args, sizeof(WRITE3args), (xdrproc_t)xdr_WRITE3res, sizeof(WRITE3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_CREATE, kv_nfs3_create_proc, (xdrproc_t)xdr_CREATE3args, sizeof(CREATE3args), (xdrproc_t)xdr_CREATE3res, sizeof(CREATE3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_MKDIR, kv_nfs3_mkdir_proc, (xdrproc_t)xdr_MKDIR3args, sizeof(MKDIR3args), (xdrproc_t)xdr_MKDIR3res, sizeof(MKDIR3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK, kv_nfs3_symlink_proc, (xdrproc_t)xdr_SYMLINK3args, sizeof(SYMLINK3args), (xdrproc_t)xdr_SYMLINK3res, sizeof(SYMLINK3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_MKNOD, kv_nfs3_mknod_proc, (xdrproc_t)xdr_MKNOD3args, sizeof(MKNOD3args), (xdrproc_t)xdr_MKNOD3res, sizeof(MKNOD3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_REMOVE, kv_nfs3_remove_proc, (xdrproc_t)xdr_REMOVE3args, sizeof(REMOVE3args), (xdrproc_t)xdr_REMOVE3res, sizeof(REMOVE3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_RMDIR, kv_nfs3_rmdir_proc, (xdrproc_t)xdr_RMDIR3args, sizeof(RMDIR3args), (xdrproc_t)xdr_RMDIR3res, sizeof(RMDIR3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_RENAME, kv_nfs3_rename_proc, (xdrproc_t)xdr_RENAME3args, sizeof(RENAME3args), (xdrproc_t)xdr_RENAME3res, sizeof(RENAME3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_LINK, kv_nfs3_link_proc, (xdrproc_t)xdr_LINK3args, sizeof(LINK3args), (xdrproc_t)xdr_LINK3res, sizeof(LINK3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_READDIR, kv_nfs3_readdir_proc, (xdrproc_t)xdr_READDIR3args, sizeof(READDIR3args), (xdrproc_t)xdr_READDIR3res, sizeof(READDIR3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, kv_nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT, kv_nfs3_fsstat_proc, (xdrproc_t)xdr_FSSTAT3args, sizeof(FSSTAT3args), (xdrproc_t)xdr_FSSTAT3res, sizeof(FSSTAT3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_FSINFO, kv_nfs3_fsinfo_proc, (xdrproc_t)xdr_FSINFO3args, sizeof(FSINFO3args), (xdrproc_t)xdr_FSINFO3res, sizeof(FSINFO3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF, kv_nfs3_pathconf_proc, (xdrproc_t)xdr_PATHCONF3args, sizeof(PATHCONF3args), (xdrproc_t)xdr_PATHCONF3res, sizeof(PATHCONF3res), this}, + {NFS_PROGRAM, NFS_V3, NFS3_COMMIT, nfs3_commit_proc, (xdrproc_t)xdr_COMMIT3args, sizeof(COMMIT3args), (xdrproc_t)xdr_COMMIT3res, sizeof(COMMIT3res), this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT, mount3_mnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3), this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP, mount3_dump_proc, NULL, 0, (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist), this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT, mount3_umnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), NULL, 0, this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL, 0, NULL, 0, this}, + {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT, mount3_export_proc, NULL, 0, (xdrproc_t)xdr_nfs_exports, sizeof(nfs_exports), this}, + }; + for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++) + { + proc_table.insert(pt[i]); + } +} + +nfs_client_t::~nfs_client_t() +{ +} diff --git a/src/nfs_kv_getattr.cpp b/src/nfs_kv_getattr.cpp new file mode 100644 index 00000000..dc5272e6 --- /dev/null +++ b/src/nfs_kv_getattr.cpp @@ -0,0 +1,81 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - GETATTR + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +// Attributes are always stored in the inode +void kv_read_inode(nfs_client_t *self, uint64_t ino, + std::function cb, + bool allow_cache) +{ + auto key = kv_inode_key(ino); + self->parent->db->get(key, [=](int res, const std::string & value) + { + if (ino == KV_ROOT_INODE && res == -ENOENT) + { + // Allow root inode to not exist + cb(0, "", json11::Json(json11::Json::object{ { "type", "dir" } })); + return; + } + if (res < 0) + { + if (res != -ENOENT) + fprintf(stderr, "Error reading inode %s: %s (code %d)\n", kv_inode_key(ino).c_str(), strerror(-res), res); + cb(res, "", json11::Json()); + return; + } + std::string err; + auto attrs = json11::Json::parse(value, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in inode %s = %s: %s\n", kv_inode_key(ino).c_str(), value.c_str(), err.c_str()); + res = -EIO; + } + cb(res, value, attrs); + }, allow_cache); +} + +int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + GETATTR3args *args = (GETATTR3args*)rop->request; + GETATTR3res *reply = (GETATTR3res*)rop->reply; + std::string fh = args->object; + auto ino = kv_fh_inode(fh); + if (self->parent->trace) + fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino); + if (!kv_fh_valid(fh)) + { + *reply = (GETATTR3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs) + { + if (res < 0) + { + *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) }; + } + else + { + *reply = (GETATTR3res){ + .status = NFS3_OK, + .resok = (GETATTR3resok){ + .obj_attributes = get_kv_attributes(self, ino, attrs), + }, + }; + } + rpc_queue_reply(rop); + }); + return 1; +} diff --git a/src/nfs_kv_link.cpp b/src/nfs_kv_link.cpp new file mode 100644 index 00000000..0b349019 --- /dev/null +++ b/src/nfs_kv_link.cpp @@ -0,0 +1,194 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - LINK + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +struct nfs_kv_link_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + uint64_t ino = 0; + uint64_t dir_ino = 0; + std::string filename; + std::string ientry_text; + json11::Json ientry; + bool retrying = false; + int wait = 0; + int res = 0, res2 = 0; + std::function cb; +}; + +static void nfs_kv_continue_link(nfs_kv_link_state *st, int state) +{ + // 1) Read the source inode + // 2) If it's a directory - fail with -EISDIR + // 3) Create the new direntry with the same inode reference + // 4) Update the inode entry with refcount++ + // 5) Retry update if CAS failed but the inode exists + // 6) Otherwise fail and remove the new direntry + // Yeah we may leave a bad direntry if we crash + // But the other option is to possibly leave an inode with too big refcount + if (state == 0) {} + else if (state == 1) goto resume_1; + else if (state == 2) goto resume_2; + else if (state == 3) goto resume_3; + else if (state == 4) goto resume_4; + else + { + fprintf(stderr, "BUG: invalid state in nfs_kv_continue_link()"); + abort(); + } +resume_0: + // Check that the source inode exists and is not a directory + st->wait = st->retrying ? 1 : 2; + st->res2 = 0; + kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs) + { + st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? -EISDIR : 0) : res; + st->ientry_text = value; + st->ientry = attrs; + if (!--st->wait) + nfs_kv_continue_link(st, 1); + }); + if (!st->retrying) + { + // Check that the new directory exists + kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json attrs) + { + st->res2 = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res; + if (!--st->wait) + nfs_kv_continue_link(st, 1); + }); + } + return; +resume_1: + if (st->res < 0 || st->res2 < 0) + { + auto cb = std::move(st->cb); + cb(st->res < 0 ? st->res : st->res2); + return; + } + // Write the new direntry + if (!st->retrying) + { + st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), + json11::Json(json11::Json::object{ { "ino", st->ino } }).dump(), [st](int res) + { + st->res = res; + nfs_kv_continue_link(st, 2); + }, [st](int res, const std::string & old_value) + { + return res == -ENOENT; + }); + return; +resume_2: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + } + // Increase inode refcount + { + auto new_ientry = st->ientry.object_items(); + auto nlink = new_ientry["nlink"].uint64_value(); + new_ientry["nlink"] = nlink ? nlink+1 : 2; + st->ientry = new_ientry; + } + st->self->parent->db->set(kv_inode_key(st->ino), st->ientry.dump(), [st](int res) + { + st->res = res; + nfs_kv_continue_link(st, 3); + }, [st](int res, const std::string & old_value) + { + st->res2 = res; + return res == 0 && old_value == st->ientry_text; + }); + return; +resume_3: + if (st->res2 == -ENOENT) + { + st->res = -ENOENT; + } + if (st->res == -EAGAIN) + { + // Re-read inode and retry + st->retrying = true; + goto resume_0; + } + if (st->res < 0) + { + // Maybe inode was deleted in the meantime, delete our direntry + st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) + { + st->res2 = res; + nfs_kv_continue_link(st, 4); + }); + return; +resume_4: + if (st->res2 < 0) + { + fprintf(stderr, "Warning: failed to delete new linked direntry %ju/%s: %s (code %d)\n", + st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2); + } + } + auto cb = std::move(st->cb); + cb(st->res); +} + +// FIXME: We'll need some tests for the FS +int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop) +{ + auto st = new nfs_kv_link_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + LINK3args *args = (LINK3args*)rop->request; + st->ino = kv_fh_inode(args->file); + st->dir_ino = kv_fh_inode(args->link.dir); + st->filename = args->link.name; + if (st->self->parent->trace) + fprintf(stderr, "[%d] LINK %ju -> %ju/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str()); + if (!st->ino || !st->dir_ino || st->filename == "") + { + LINK3res *reply = (LINK3res*)rop->reply; + *reply = (LINK3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + st->cb = [st](int res) + { + LINK3res *reply = (LINK3res*)st->rop->reply; + if (res < 0) + { + *reply = (LINK3res){ .status = vitastor_nfs_map_err(res) }; + } + else + { + *reply = (LINK3res){ + .status = NFS3_OK, + .resok = (LINK3resok){ + .file_attributes = (post_op_attr){ + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, st->ino, st->ientry), + }, + }, + }; + } + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_link(st, 0); + return 1; +} diff --git a/src/nfs_kv_lookup.cpp b/src/nfs_kv_lookup.cpp new file mode 100644 index 00000000..4e44279b --- /dev/null +++ b/src/nfs_kv_lookup.cpp @@ -0,0 +1,109 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - LOOKUP, READLINK + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + LOOKUP3args *args = (LOOKUP3args*)rop->request; + LOOKUP3res *reply = (LOOKUP3res*)rop->reply; + inode_t dir_ino = kv_fh_inode(args->what.dir); + std::string filename = args->what.name; + if (self->parent->trace) + fprintf(stderr, "[%d] LOOKUP %ju/%s\n", self->nfs_fd, dir_ino, filename.c_str()); + if (!dir_ino || filename == "") + { + *reply = (LOOKUP3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + self->parent->db->get(kv_direntry_key(dir_ino, filename), [=](int res, const std::string & value) + { + if (res < 0) + { + *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(-res) }; + rpc_queue_reply(rop); + return; + } + std::string err; + auto direntry = json11::Json::parse(value, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", kv_direntry_key(dir_ino, filename).c_str(), value.c_str(), err.c_str()); + *reply = (LOOKUP3res){ .status = NFS3ERR_IO }; + rpc_queue_reply(rop); + return; + } + uint64_t ino = direntry["ino"].uint64_value(); + kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json ientry) + { + if (res < 0) + { + *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(res == -ENOENT ? -EIO : res) }; + rpc_queue_reply(rop); + return; + } + *reply = (LOOKUP3res){ + .status = NFS3_OK, + .resok = (LOOKUP3resok){ + .object = xdr_copy_string(rop->xdrs, kv_fh(ino)), + .obj_attributes = { + .attributes_follow = 1, + .attributes = get_kv_attributes(self, ino, ientry), + }, + }, + }; + rpc_queue_reply(rop); + }); + }); + return 1; +} + +int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + READLINK3args *args = (READLINK3args*)rop->request; + if (self->parent->trace) + fprintf(stderr, "[%d] READLINK %ju\n", self->nfs_fd, kv_fh_inode(args->symlink)); + READLINK3res *reply = (READLINK3res*)rop->reply; + if (!kv_fh_valid(args->symlink) || args->symlink == KV_ROOT_HANDLE) + { + // Invalid filehandle or trying to read symlink from root entry + *reply = (READLINK3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + kv_read_inode(self, kv_fh_inode(args->symlink), [=](int res, const std::string & value, json11::Json attrs) + { + if (res < 0) + { + *reply = (READLINK3res){ .status = vitastor_nfs_map_err(-res) }; + } + else if (attrs["type"] != "link") + { + *reply = (READLINK3res){ .status = NFS3ERR_INVAL }; + } + else + { + *reply = (READLINK3res){ + .status = NFS3_OK, + .resok = (READLINK3resok){ + .data = xdr_copy_string(rop->xdrs, attrs["symlink"].string_value()), + }, + }; + } + rpc_queue_reply(rop); + }); + return 1; +} diff --git a/src/nfs_kv_read.cpp b/src/nfs_kv_read.cpp new file mode 100644 index 00000000..1b06fad2 --- /dev/null +++ b/src/nfs_kv_read.cpp @@ -0,0 +1,69 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - READ + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + READ3args *args = (READ3args*)rop->request; + READ3res *reply = (READ3res*)rop->reply; + inode_t ino = kv_fh_inode(args->file); + if (args->count > MAX_REQUEST_SIZE || !ino) + { + *reply = (READ3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + uint64_t alignment = self->parent->cli->st_cli.global_bitmap_granularity; + auto pool_cfg = self->parent->cli->st_cli.pool_config.find(INODE_POOL(self->parent->fs_base_inode)); + if (pool_cfg != self->parent->cli->st_cli.pool_config.end()) + { + alignment = pool_cfg->second.bitmap_granularity; + } + uint64_t aligned_offset = args->offset - (args->offset % alignment); + uint64_t aligned_count = args->offset + args->count; + if (aligned_count % alignment) + aligned_count = aligned_count + alignment - (aligned_count % alignment); + aligned_count -= aligned_offset; + void *buf = malloc_or_die(aligned_count); + xdr_add_malloc(rop->xdrs, buf); + cluster_op_t *op = new cluster_op_t; + op->opcode = OSD_OP_READ; + op->inode = self->parent->fs_base_inode + ino; + op->offset = aligned_offset; + op->len = aligned_count; + op->iov.push_back(buf, aligned_count); + *reply = (READ3res){ .status = NFS3_OK }; + reply->resok.data.data = (char*)buf + args->offset - aligned_offset; + reply->resok.data.size = args->count; + op->callback = [rop](cluster_op_t *op) + { + READ3res *reply = (READ3res*)rop->reply; + if (op->retval != op->len) + { + *reply = (READ3res){ .status = vitastor_nfs_map_err(-op->retval) }; + } + else + { + auto & reply_ok = reply->resok; + // reply_ok.data.data is already set above + reply_ok.count = reply_ok.data.size; + reply_ok.eof = 0; + } + rpc_queue_reply(rop); + delete op; + }; + self->parent->cli->execute(op); + return 1; +} diff --git a/src/nfs_kv_readdir.cpp b/src/nfs_kv_readdir.cpp new file mode 100644 index 00000000..1b9ff7c6 --- /dev/null +++ b/src/nfs_kv_readdir.cpp @@ -0,0 +1,376 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - READDIR, READDIRPLUS + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +static unsigned len_pad4(unsigned len) +{ + return len + (len&3 ? 4-(len&3) : 0); +} + +struct nfs_kv_readdir_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + // Request: + bool is_plus = false; + uint64_t cookie = 0; + uint64_t cookieverf = 0; + uint64_t dir_ino = 0; + uint64_t maxcount = 0; + std::function cb; + // State: + int res = 0; + std::string prefix, start; + void *list_handle; + uint64_t parent_ino = 0; + std::string ientry_text, parent_ientry_text; + json11::Json ientry, parent_ientry; + std::string cur_key, cur_value; + int reply_size = 0; + int to_skip = 0; + uint64_t offset = 0; + int getattr_running = 0, getattr_cur = 0; + // Result: + bool eof = false; + //uint64_t cookieverf = 0; // same field + std::vector entries; +}; + +static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state); + +static void kv_getattr_next(nfs_kv_readdir_state *st) +{ + while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel) + { + auto idx = st->getattr_cur++; + st->getattr_running++; + kv_read_inode(st->self, st->entries[idx].fileid, [st, idx](int res, const std::string & value, json11::Json ientry) + { + if (res == 0) + { + st->entries[idx].name_attributes = (post_op_attr){ + // FIXME: maybe do not read parent attributes and leave them to a GETATTR? + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, st->entries[idx].fileid, ientry), + }; + } + st->getattr_running--; + kv_getattr_next(st); + if (st->getattr_running == 0 && !st->list_handle) + { + nfs_kv_continue_readdir(st, 4); + } + }); + } +} + +static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state) +{ + if (state == 0) {} + else if (state == 1) goto resume_1; + else if (state == 2) goto resume_2; + else if (state == 3) goto resume_3; + else if (state == 4) goto resume_4; + else + { + fprintf(stderr, "BUG: invalid state in nfs_kv_continue_readdir()"); + abort(); + } + // Limit results based on maximum reply size + // Sadly we have to calculate reply size by hand + // reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes + st->reply_size = 20; + if (st->reply_size > st->maxcount) + { + // Error, too small max reply size + auto cb = std::move(st->cb); + cb(-NFS3ERR_TOOSMALL); + return; + } + // Add . and .. + if (st->cookie <= 1) + { + kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json ientry) + { + st->res = res; + st->ientry_text = value; + st->ientry = ientry; + nfs_kv_continue_readdir(st, 1); + }); + return; +resume_1: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + if (st->cookie == 0) + { + auto fh = kv_fh(st->dir_ino); + auto entry_size = 20 + 4/*len_pad4(".")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); + if (st->reply_size + entry_size > st->maxcount) + { + auto cb = std::move(st->cb); + cb(-NFS3ERR_TOOSMALL); + return; + } + entryplus3 dot = {}; + dot.name = xdr_copy_string(st->rop->xdrs, "."); + dot.fileid = st->dir_ino; + dot.name_attributes = (post_op_attr){ + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, st->dir_ino, st->ientry), + }; + dot.name_handle = (post_op_fh3){ + .handle_follows = 1, + .handle = xdr_copy_string(st->rop->xdrs, fh), + }; + st->entries.push_back(dot); + st->reply_size += entry_size; + } + st->parent_ino = st->ientry["parent_ino"].uint64_value(); + if (st->parent_ino) + { + kv_read_inode(st->self, st->ientry["parent_ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry) + { + st->res = res; + st->parent_ientry_text = value; + st->parent_ientry = ientry; + nfs_kv_continue_readdir(st, 2); + }); + return; +resume_2: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + } + auto fh = kv_fh(st->parent_ino); + auto entry_size = 20 + 4/*len_pad4("..")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); + if (st->reply_size + entry_size > st->maxcount) + { + st->eof = false; + auto cb = std::move(st->cb); + cb(0); + return; + } + entryplus3 dotdot = {}; + dotdot.name = xdr_copy_string(st->rop->xdrs, ".."); + dotdot.fileid = st->dir_ino; + dotdot.name_attributes = (post_op_attr){ + // FIXME: maybe do not read parent attributes and leave them to a GETATTR? + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, + st->parent_ino ? st->parent_ino : st->dir_ino, + st->parent_ino ? st->parent_ientry : st->ientry), + }; + dotdot.name_handle = (post_op_fh3){ + .handle_follows = 1, + .handle = xdr_copy_string(st->rop->xdrs, fh), + }; + st->entries.push_back(dotdot); + st->reply_size += entry_size; + } + st->prefix = kv_direntry_key(st->dir_ino, ""); + st->eof = true; + st->start = st->prefix; + if (st->cookie > 1) + { + auto lc_it = st->self->parent->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie }); + if (lc_it != st->self->parent->list_cookies.end()) + { + st->start = lc_it->second.key; + st->to_skip = 1; + st->offset = st->cookie; + } + else + { + st->to_skip = st->cookie-2; + st->offset = 2; + st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62)); + } + } + else + { + st->to_skip = 0; + st->offset = 2; + st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62)); + } + { + auto lc_it = st->self->parent->list_cookies.lower_bound((list_cookie_t){ st->dir_ino, st->cookieverf, 0 }); + if (lc_it != st->self->parent->list_cookies.end() && + lc_it->first.dir_ino == st->dir_ino && + lc_it->first.cookieverf == st->cookieverf && + lc_it->first.cookie < st->cookie) + { + auto lc_start = lc_it; + while (lc_it != st->self->parent->list_cookies.end() && lc_it->first.cookieverf == st->cookieverf) + { + lc_it++; + } + st->self->parent->list_cookies.erase(lc_start, lc_it); + } + } + st->getattr_cur = st->entries.size(); + st->list_handle = st->self->parent->db->list_start(st->start); + st->self->parent->db->list_next(st->list_handle, [=](int res, const std::string & key, const std::string & value) + { + st->res = res; + st->cur_key = key; + st->cur_value = value; + nfs_kv_continue_readdir(st, 3); + }); + return; + while (st->list_handle) + { + st->self->parent->db->list_next(st->list_handle, NULL); + return; +resume_3: + if (st->res == -ENOENT || st->cur_key.size() > st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix) + { + st->self->parent->db->list_close(st->list_handle); + st->list_handle = NULL; + break; + } + if (st->to_skip > 0) + { + st->to_skip--; + continue; + } + std::string err; + auto direntry = json11::Json::parse(st->cur_value, err); + if (err != "") + { + fprintf(stderr, "readdir: direntry %s contains invalid JSON: %s, skipping\n", + st->cur_key.c_str(), st->cur_value.c_str()); + continue; + } + auto ino = direntry["ino"].uint64_value(); + auto name = kv_direntry_filename(st->cur_key); + auto fh = kv_fh(ino); + // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes + // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8 + // + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0) + // + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes + auto entry_size = 20 + len_pad4(name.size()) + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0); + if (st->reply_size + entry_size > st->maxcount) + { + st->eof = false; + st->self->parent->db->list_close(st->list_handle); + st->list_handle = NULL; + break; + } + st->reply_size += entry_size; + auto idx = st->entries.size(); + st->entries.push_back((entryplus3){}); + auto entry = &st->entries[idx]; + entry->name = xdr_copy_string(st->rop->xdrs, name); + entry->fileid = ino; + entry->cookie = st->offset++; + st->self->parent->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name }; + if (st->is_plus) + { + entry->name_handle = (post_op_fh3){ + .handle_follows = 1, + .handle = xdr_copy_string(st->rop->xdrs, fh), + }; + kv_getattr_next(st); + } + st->self->parent->db->list_next(st->list_handle, NULL); + } +resume_4: + while (st->getattr_running > 0) + { + return; + } + void *prev = NULL; + for (int i = 0; i < st->entries.size(); i++) + { + entryplus3 *entry = &st->entries[i]; + if (prev) + { + if (st->is_plus) + ((entryplus3*)prev)->nextentry = entry; + else + ((entry3*)prev)->nextentry = (entry3*)entry; + } + prev = entry; + } + // Send reply + auto cb = std::move(st->cb); + cb(0); +} + +static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus) +{ + auto st = new nfs_kv_readdir_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + st->is_plus = is_plus; + if (st->is_plus) + { + READDIRPLUS3args *args = (READDIRPLUS3args*)rop->request; + st->dir_ino = kv_fh_inode(args->dir); + st->cookie = args->cookie; + st->cookieverf = *((uint64_t*)args->cookieverf); + st->maxcount = args->maxcount; + } + else + { + READDIR3args *args = ((READDIR3args*)rop->request); + st->dir_ino = kv_fh_inode(args->dir); + st->cookie = args->cookie; + st->cookieverf = *((uint64_t*)args->cookieverf); + st->maxcount = args->count; + } + if (st->self->parent->trace) + fprintf(stderr, "[%d] READDIR %ju VERF %jx OFFSET %ju LIMIT %ju\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount); + st->cb = [st](int res) + { + if (st->is_plus) + { + READDIRPLUS3res *reply = (READDIRPLUS3res*)st->rop->reply; + *reply = (READDIRPLUS3res){ .status = vitastor_nfs_map_err(res) }; + *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf; + reply->resok.reply.entries = st->entries.size() ? &st->entries[0] : NULL; + reply->resok.reply.eof = st->eof; + } + else + { + READDIR3res *reply = (READDIR3res*)st->rop->reply; + *reply = (READDIR3res){ .status = vitastor_nfs_map_err(res) }; + *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf; + reply->resok.reply.entries = st->entries.size() ? (entry3*)&st->entries[0] : NULL; + reply->resok.reply.eof = st->eof; + } + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_readdir(st, 0); +} + +int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop) +{ + nfs3_readdir_common(opaque, rop, false); + return 0; +} + +int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop) +{ + nfs3_readdir_common(opaque, rop, true); + return 0; +} diff --git a/src/nfs_kv_remove.cpp b/src/nfs_kv_remove.cpp new file mode 100644 index 00000000..54c78cfc --- /dev/null +++ b/src/nfs_kv_remove.cpp @@ -0,0 +1,316 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - REMOVE, RMDIR + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +struct kv_del_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + uint64_t dir_ino = 0; + std::string filename; + uint64_t ino = 0; + void *list_handle = NULL; + std::string prefix, list_key, direntry_text, ientry_text; + json11::Json direntry, ientry; + int type = 0; + bool is_rmdir = false; + bool rm_data = false; + int res = 0, res2 = 0; + std::function cb; +}; + +static void nfs_kv_continue_delete(kv_del_state *st, int state) +{ + // Overall algorithm: + // 1) Get inode attributes and check that it's not a directory (REMOVE) + // 2) Get inode attributes and check that it is a directory (RMDIR) + // 3) Delete direntry with CAS + // 4) Check that the directory didn't contain files (RMDIR) and restore it if it did + // 5) Reduce inode refcount by 1 or delete inode + // 6) If regular file and inode is deleted: delete data + if (state == 0) {} + else if (state == 1) goto resume_1; + else if (state == 2) goto resume_2; + else if (state == 3) goto resume_3; + else if (state == 4) goto resume_4; + else if (state == 5) goto resume_5; + else if (state == 6) goto resume_6; + else if (state == 7) goto resume_7; + else + { + fprintf(stderr, "BUG: invalid state in nfs_kv_continue_delete()"); + abort(); + } + st->self->parent->db->get(kv_direntry_key(st->dir_ino, st->filename), [st](int res, const std::string & value) + { + st->res = res; + st->direntry_text = value; + nfs_kv_continue_delete(st, 1); + }); + return; +resume_1: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + { + std::string err; + st->direntry = json11::Json::parse(st->direntry_text, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, deleting\n", + kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), err.c_str()); + // Just delete direntry and skip inode + } + else + { + st->ino = st->direntry["ino"].uint64_value(); + } + } + // Get inode + st->self->parent->db->get(kv_inode_key(st->ino), [st](int res, const std::string & value) + { + st->res = res; + st->ientry_text = value; + nfs_kv_continue_delete(st, 2); + }); + return; +resume_2: + if (st->res < 0) + { + fprintf(stderr, "error reading inode %s: %s (code %d)\n", + kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res); + auto cb = std::move(st->cb); + cb(st->res); + return; + } + { + std::string err; + st->ientry = json11::Json::parse(st->ientry_text, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in inode %s = %s: %s, treating as a regular file\n", + kv_inode_key(st->ino).c_str(), st->ientry_text.c_str(), err.c_str()); + } + } + // (1-2) Check type + st->type = kv_map_type(st->ientry["type"].string_value()); + if (st->type == -1 || st->is_rmdir != (st->type == NF3DIR)) + { + auto cb = std::move(st->cb); + cb(st->is_rmdir ? -ENOTDIR : -EISDIR); + return; + } + // (3) Delete direntry with CAS + st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) + { + st->res = res; + nfs_kv_continue_delete(st, 3); + }, [st](int res, const std::string & value) + { + return value == st->direntry_text; + }); + return; +resume_3: + if (st->res == -EAGAIN) + { + // CAS failure, restart from the beginning + nfs_kv_continue_delete(st, 0); + return; + } + else if (st->res < 0 && st->res != -ENOENT) + { + fprintf(stderr, "failed to remove direntry %s: %s (code %d)\n", + kv_direntry_key(st->dir_ino, st->filename).c_str(), strerror(-st->res), st->res); + auto cb = std::move(st->cb); + cb(st->res); + return; + } + if (!st->ino) + { + // direntry contained invalid JSON and was deleted, finish + auto cb = std::move(st->cb); + cb(0); + return; + } + if (st->is_rmdir) + { + // (4) Check if directory actually is not empty + st->list_handle = st->self->parent->db->list_start(kv_direntry_key(st->ino, "")); + st->self->parent->db->list_next(st->list_handle, [st](int res, const std::string & key, const std::string & value) + { + st->res = res; + st->list_key = key; + st->self->parent->db->list_close(st->list_handle); + nfs_kv_continue_delete(st, 4); + }); + return; +resume_4: + st->prefix = kv_direntry_key(st->ino, ""); + if (st->res == -ENOENT || st->list_key.size() < st->prefix.size() || st->list_key.substr(0, st->prefix.size()) != st->prefix) + { + // OK, directory is empty + } + else + { + // Not OK, restore direntry + st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res) + { + st->res2 = res; + nfs_kv_continue_delete(st, 5); + }, [st](int res, const std::string & value) + { + return res == -ENOENT; + }); + return; +resume_5: + if (st->res2 < 0) + { + fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)", + kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2); + fprintf(stderr, " - inode %ju may be left as garbage\n", st->ino); + } + if (st->res < 0) + { + fprintf(stderr, "failed to list entries from %s: %s (code %d)\n", + kv_direntry_key(st->ino, "").c_str(), strerror(-st->res), st->res); + } + auto cb = std::move(st->cb); + cb(st->res < 0 ? st->res : -ENOTEMPTY); + return; + } + } + // (5) Reduce inode refcount by 1 or delete inode + if (st->ientry["nlink"].uint64_value() > 1) + { + auto copy = st->ientry.object_items(); + copy["nlink"] = st->ientry["nlink"].uint64_value()-1; + st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(copy).dump(), [st](int res) + { + st->res = res; + nfs_kv_continue_delete(st, 6); + }, [st](int res, const std::string & old_value) + { + return old_value == st->ientry_text; + }); + } + else + { + st->self->parent->db->del(kv_inode_key(st->ino), [st](int res) + { + st->res = res; + nfs_kv_continue_delete(st, 6); + }, [st](int res, const std::string & old_value) + { + return old_value == st->ientry_text; + }); + } + return; +resume_6: + if (st->res < 0) + { + // Assume EAGAIN is OK, maybe someone created a hard link in the meantime + auto cb = std::move(st->cb); + cb(st->res == -EAGAIN ? 0 : st->res); + return; + } + // (6) If regular file and inode is deleted: delete data + if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1) + { + // Remove data + st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { + { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, + { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, + }), [st](const cli_result_t & r) + { + if (r.err) + { + fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n", + st->ino, r.text.c_str(), r.err); + } + st->res = r.err; + nfs_kv_continue_delete(st, 7); + }); + return; +resume_7: + auto cb = std::move(st->cb); + cb(st->res); + return; + } + auto cb = std::move(st->cb); + cb(0); +} + +int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop) +{ + kv_del_state *st = new kv_del_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + REMOVE3res *reply = (REMOVE3res*)rop->reply; + REMOVE3args *args = (REMOVE3args*)rop->request; + st->dir_ino = kv_fh_inode(args->object.dir); + st->filename = args->object.name; + if (st->self->parent->trace) + fprintf(stderr, "[%d] REMOVE %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str()); + if (!st->dir_ino) + { + *reply = (REMOVE3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + st->cb = [st](int res) + { + *((REMOVE3res*)st->rop->reply) = (REMOVE3res){ + .status = vitastor_nfs_map_err(res), + }; + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_delete(st, 0); + return 1; +} + +int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop) +{ + kv_del_state *st = new kv_del_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + RMDIR3args *args = (RMDIR3args*)rop->request; + RMDIR3res *reply = (RMDIR3res*)rop->reply; + st->dir_ino = kv_fh_inode(args->object.dir); + st->filename = args->object.name; + st->is_rmdir = true; + if (st->self->parent->trace) + fprintf(stderr, "[%d] RMDIR %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str()); + if (!st->dir_ino) + { + *reply = (RMDIR3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + st->cb = [st](int res) + { + *((RMDIR3res*)st->rop->reply) = (RMDIR3res){ + .status = vitastor_nfs_map_err(res), + }; + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_delete(st, 0); + return 1; +} diff --git a/src/nfs_kv_rename.cpp b/src/nfs_kv_rename.cpp new file mode 100644 index 00000000..a67f3377 --- /dev/null +++ b/src/nfs_kv_rename.cpp @@ -0,0 +1,211 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - RENAME + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +struct nfs_kv_rename_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + uint64_t old_dir_ino = 0, new_dir_ino = 0; + std::string old_name, new_name; + std::string old_direntry_text; + std::string ientry_text; + json11::Json direntry, ientry; + int res = 0, res2 = 0; + std::function cb; +}; + +static void nfs_kv_continue_rename(nfs_kv_rename_state *st, int state) +{ + // Simplified algorithm (non-atomic and without ENOTDIR/EISDIR): + // 1) Check if the target directory exists + // 2) Delete & save (using CAS) the source direntry + // 3) Write the target direntry using CAS, fail if it already exists + // 4) Restore the source direntry on failure + // Atomic version would require something like a journal + if (state == 0) {} + else if (state == 1) goto resume_1; + else if (state == 2) goto resume_2; + else if (state == 3) goto resume_3; + else if (state == 4) goto resume_4; + else if (state == 5) goto resume_5; + else if (state == 6) goto resume_6; + else + { + fprintf(stderr, "BUG: invalid state in nfs_kv_continue_rename()"); + abort(); + } + kv_read_inode(st->self, st->new_dir_ino, [st](int res, const std::string & value, json11::Json attrs) + { + st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res; + nfs_kv_continue_rename(st, 1); + }); + return; +resume_1: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + // Read & delete the old direntry + st->self->parent->db->del(kv_direntry_key(st->old_dir_ino, st->old_name), [st](int res) + { + st->res = res; + nfs_kv_continue_rename(st, 2); + }, [=](int res, const std::string & old_value) + { + st->res = res; + st->old_direntry_text = old_value; + return true; + }); + return; +resume_2: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + { + std::string err; + st->direntry = json11::Json::parse(st->old_direntry_text, err); + if (err != "") + { + fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", + kv_direntry_key(st->old_dir_ino, st->old_name).c_str(), + st->old_direntry_text.c_str(), err.c_str()); + } + } + if (st->direntry["type"].string_value() == "dir" && + st->direntry["ino"].uint64_value() != 0 && + st->new_dir_ino != st->old_dir_ino) + { + // Read & check inode + kv_read_inode(st->self, st->direntry["ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry) + { + st->res = res; + st->ientry_text = value; + st->ientry = ientry; + nfs_kv_continue_rename(st, 3); + }); + return; +resume_3: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + // Change parent reference + { + auto ientry_new = st->ientry.object_items(); + ientry_new["parent_ino"] = st->new_dir_ino; + st->self->parent->db->set(kv_inode_key(st->direntry["ino"].uint64_value()), json11::Json(ientry_new).dump(), [st](int res) + { + st->res = res; + nfs_kv_continue_rename(st, 4); + }, [st](int res, const std::string & old_value) + { + return old_value == st->ientry_text; + }); + } + return; +resume_4: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + } + st->self->parent->db->set(kv_direntry_key(st->new_dir_ino, st->new_name), st->old_direntry_text, [st](int res) + { + st->res = res; + nfs_kv_continue_rename(st, 5); + }, [st](int res, const std::string & old_value) + { + return res == -ENOENT; + }); + return; +resume_5: + if (st->res < 0) + { + if (st->res == -EAGAIN) + st->res = -EEXIST; + st->res2 = st->res; + st->self->parent->db->set(kv_direntry_key(st->old_dir_ino, st->old_name), st->old_direntry_text, [st](int res) + { + st->res = res; + nfs_kv_continue_rename(st, 6); + }, [st](int res, const std::string & old_value) + { + return res == -ENOENT; + }); + return; +resume_6: + if (st->res < 0) + { + if (st->res == -EAGAIN) + st->res = -EEXIST; + fprintf(stderr, "error restoring %s = %s after failed rename: %s (code %d)\n", + kv_direntry_key(st->old_dir_ino, st->old_name).c_str(), st->old_direntry_text.c_str(), + strerror(-st->res), st->res); + } + auto cb = std::move(st->cb); + cb(st->res2); + return; + } + auto cb = std::move(st->cb); + cb(st->res); +} + +int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop) +{ + auto st = new nfs_kv_rename_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + RENAME3args *args = (RENAME3args*)rop->request; + st->old_dir_ino = kv_fh_inode(args->from.dir); + st->new_dir_ino = kv_fh_inode(args->to.dir); + st->old_name = args->from.name; + st->new_name = args->to.name; + if (st->self->parent->trace) + fprintf(stderr, "[%d] RENAME %ju/%s -> %ju/%s\n", st->self->nfs_fd, st->old_dir_ino, st->old_name.c_str(), st->new_dir_ino, st->new_name.c_str()); + if (!st->old_dir_ino || !st->new_dir_ino || st->old_name == "" || st->new_name == "") + { + RENAME3res *reply = (RENAME3res*)rop->reply; + *reply = (RENAME3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + if (st->old_dir_ino == st->new_dir_ino && st->old_name == st->new_name) + { + RENAME3res *reply = (RENAME3res*)rop->reply; + *reply = (RENAME3res){ .status = NFS3_OK }; + rpc_queue_reply(st->rop); + delete st; + return 0; + } + st->cb = [st](int res) + { + RENAME3res *reply = (RENAME3res*)st->rop->reply; + *reply = (RENAME3res){ .status = vitastor_nfs_map_err(res) }; + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_rename(st, 0); + return 1; +} diff --git a/src/nfs_kv_setattr.cpp b/src/nfs_kv_setattr.cpp new file mode 100644 index 00000000..e3589499 --- /dev/null +++ b/src/nfs_kv_setattr.cpp @@ -0,0 +1,187 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - SETATTR + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +struct nfs_kv_setattr_state +{ + nfs_client_t *self = NULL; + rpc_op_t *rop = NULL; + uint64_t ino = 0; + uint64_t old_size = 0, new_size = 0; + json11::Json::object set_attrs; + int res = 0, cas_res = 0; + std::string ientry_text; + json11::Json ientry; + json11::Json::object new_attrs; + std::function cb; +}; + +static void nfs_kv_continue_setattr(nfs_kv_setattr_state *st, int state) +{ + if (state == 0) {} + else if (state == 1) goto resume_1; + else if (state == 2) goto resume_2; + else if (state == 3) goto resume_3; + else + { + fprintf(stderr, "BUG: invalid state in nfs_kv_continue_setattr()"); + abort(); + } +resume_0: + kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs) + { + st->res = res; + st->ientry_text = value; + st->ientry = attrs; + nfs_kv_continue_setattr(st, 1); + }); + return; +resume_1: + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + if (st->ientry["type"].string_value() == "link" || + st->ientry["type"].string_value() != "file" && + st->ientry["type"].string_value() != "" && + !st->set_attrs["size"].is_null()) + { + auto cb = std::move(st->cb); + cb(-EINVAL); + return; + } + // Now we can update it + st->new_attrs = st->ientry.object_items(); + st->old_size = st->ientry["size"].uint64_value(); + for (auto & kv: st->set_attrs) + { + if (kv.first == "size") + { + st->new_size = kv.second.uint64_value(); + } + st->new_attrs[kv.first] = kv.second; + } + st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(st->new_attrs).dump(), [st](int res) + { + st->res = res; + nfs_kv_continue_setattr(st, 2); + }, [st](int res, const std::string & cas_value) + { + st->cas_res = res; + return (res == 0 || res == -ENOENT && st->ino == KV_ROOT_INODE) && cas_value == st->ientry_text; + }); + return; +resume_2: + if (st->cas_res == -ENOENT) + { + st->res = -ENOENT; + } + if (st->res == -EAGAIN) + { + // Retry + fprintf(stderr, "CAS failure during setattr, retrying\n"); + goto resume_0; + } + if (st->res < 0) + { + fprintf(stderr, "Failed to update inode %ju: %s (code %d)\n", st->ino, strerror(-st->res), st->res); + auto cb = std::move(st->cb); + cb(st->res); + return; + } + if (!st->set_attrs["size"].is_null() && + st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value()) + { + // Delete extra data when downsizing + st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { + { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) }, + { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) }, + { "min_offset", st->set_attrs["size"].uint64_value() }, + }), [st](const cli_result_t & r) + { + if (r.err) + { + fprintf(stderr, "Failed to truncate inode %ju: %s (code %d)\n", + st->ino, r.text.c_str(), r.err); + } + st->res = r.err; + nfs_kv_continue_setattr(st, 3); + }); + return; + } +resume_3: + auto cb = std::move(st->cb); + cb(0); +} + +int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop) +{ + nfs_kv_setattr_state *st = new nfs_kv_setattr_state; + st->self = (nfs_client_t*)opaque; + st->rop = rop; + auto args = (SETATTR3args*)rop->request; + auto reply = (SETATTR3res*)rop->reply; + std::string fh = args->object; + if (!kv_fh_valid(fh)) + { + *reply = (SETATTR3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + delete st; + return 0; + } + st->ino = kv_fh_inode(fh); + if (args->new_attributes.size.set_it) + st->set_attrs["size"] = args->new_attributes.size.size; + if (args->new_attributes.mode.set_it) + st->set_attrs["mode"] = (uint64_t)args->new_attributes.mode.mode; + if (args->new_attributes.uid.set_it) + st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid; + if (args->new_attributes.gid.set_it) + st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid; + if (args->new_attributes.atime.set_it) + st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime); + if (args->new_attributes.mtime.set_it) + st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime); + fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str()); + st->cb = [st](int res) + { + auto reply = (SETATTR3res*)st->rop->reply; + if (res < 0) + { + *reply = (SETATTR3res){ + .status = vitastor_nfs_map_err(res), + }; + } + else + { + *reply = (SETATTR3res){ + .status = NFS3_OK, + .resok = (SETATTR3resok){ + .obj_wcc = (wcc_data){ + .after = (post_op_attr){ + .attributes_follow = 1, + .attributes = get_kv_attributes(st->self, st->ino, st->new_attrs), + }, + }, + }, + }; + } + rpc_queue_reply(st->rop); + delete st; + }; + nfs_kv_continue_setattr(st, 0); + return 1; +} diff --git a/src/nfs_kv_write.cpp b/src/nfs_kv_write.cpp new file mode 100644 index 00000000..fbcaa5d1 --- /dev/null +++ b/src/nfs_kv_write.cpp @@ -0,0 +1,266 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy over VitastorKV database - WRITE + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf); + +int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + WRITE3args *args = (WRITE3args*)rop->request; + WRITE3res *reply = (WRITE3res*)rop->reply; + inode_t ino = kv_fh_inode(args->file); + if (!ino) + { + *reply = (WRITE3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + if (args->count > MAX_REQUEST_SIZE) + { + *reply = (WRITE3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + return 0; + } + uint64_t count = args->count > args->data.size ? args->data.size : args->count; + uint64_t alignment = self->parent->cli->st_cli.global_bitmap_granularity; + auto pool_cfg = self->parent->cli->st_cli.pool_config.find(INODE_POOL(self->parent->fs_base_inode)); + if (pool_cfg != self->parent->cli->st_cli.pool_config.end()) + { + alignment = pool_cfg->second.bitmap_granularity; + } + // Pre-fill reply + *reply = (WRITE3res){ + .status = NFS3_OK, + .resok = (WRITE3resok){ + //.file_wcc = ..., + .count = (unsigned)count, + }, + }; + if ((args->offset % alignment) != 0 || (count % alignment) != 0) + { + // Unaligned write, requires read-modify-write + uint64_t aligned_offset = args->offset - (args->offset % alignment); + uint64_t aligned_count = args->offset + args->count; + if (aligned_count % alignment) + aligned_count = aligned_count + alignment - (aligned_count % alignment); + aligned_count -= aligned_offset; + void *buf = malloc_or_die(aligned_count); + xdr_add_malloc(rop->xdrs, buf); + // Read + cluster_op_t *op = new cluster_op_t; + op->opcode = OSD_OP_READ; + op->inode = self->parent->fs_base_inode + ino; + op->offset = aligned_offset; + op->len = aligned_count; + op->iov.push_back(buf, aligned_count); + op->callback = [self, rop, count](cluster_op_t *op) + { + if (op->retval != op->len) + { + WRITE3res *reply = (WRITE3res*)rop->reply; + *reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) }; + rpc_queue_reply(rop); + return; + } + void *buf = op->iov.buf[0].iov_base; + WRITE3args *args = (WRITE3args*)rop->request; + memcpy((uint8_t*)buf + args->offset - op->offset, args->data.data, count); + nfs_resize_write(self, rop, op->inode, args->offset+count, op->offset, op->len, buf); + delete op; + }; + self->parent->cli->execute(op); + } + else + { + nfs_resize_write(self, rop, ino, args->offset+count, args->offset, count, args->data.data); + } + return 1; +} + +static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t inode, int res) +{ + WRITE3args *args = (WRITE3args*)rop->request; + WRITE3res *reply = (WRITE3res*)rop->reply; + if (res < 0) + { + *reply = (WRITE3res){ .status = vitastor_nfs_map_err(res) }; + rpc_queue_reply(rop); + return; + } + bool imm = self->parent->cli->get_immediate_commit(inode); + reply->resok.committed = args->stable != UNSTABLE || imm ? FILE_SYNC : UNSTABLE; + *(uint64_t*)reply->resok.verf = self->parent->server_id; + if (args->stable != UNSTABLE && !imm) + { + // Client requested a stable write. Add an fsync + auto op = new cluster_op_t; + op->opcode = OSD_OP_SYNC; + op->callback = [rop](cluster_op_t *op) + { + if (op->retval != 0) + { + WRITE3res *reply = (WRITE3res*)rop->reply; + *reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) }; + } + delete op; + rpc_queue_reply(rop); + }; + self->parent->cli->execute(op); + } + else + { + rpc_queue_reply(rop); + } +} + +static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err) +{ + auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 }); + while (ext_it != self->extend_writes.end() && + ext_it->first.inode == inode && + ext_it->first.new_size <= new_size) + { + ext_it->second.resize_res = err; + if (ext_it->second.write_res <= 0) + { + complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0 + ? ext_it->second.write_res : ext_it->second.resize_res); + self->extend_writes.erase(ext_it++); + } + else + ext_it++; + } +} + +static void extend_inode(nfs_client_t *self, uint64_t inode) +{ + // Send an extend request + auto ext = &self->extends[inode]; + self->parent->db->set(kv_inode_key(inode), json11::Json(ext->attrs).dump().c_str(), [=](int res) + { + if (res < 0 && res != -EAGAIN) + { + fprintf(stderr, "Error extending inode %ju to %ju bytes: %s (code %d)\n", inode, ext->cur_extend, strerror(-res), res); + } + if (res == -EAGAIN || ext->next_extend > ext->cur_extend) + { + // Multiple concurrent resize requests received, try to repeat + kv_read_inode(self, inode, [=](int res, const std::string & value, json11::Json attrs) + { + auto new_size = ext->next_extend > ext->cur_extend ? ext->next_extend : ext->cur_extend; + if (res == 0 && attrs["size"].uint64_value() < new_size) + { + ext->old_ientry = value; + ext->attrs = attrs.object_items(); + ext->cur_extend = new_size; + ext->attrs["size"] = new_size; + extend_inode(self, inode); + } + else + { + if (res < 0) + { + fprintf(stderr, "Error extending inode %ju to %ju bytes: %s (code %d)\n", inode, ext->cur_extend, strerror(-res), res); + } + ext->cur_extend = ext->next_extend = 0; + complete_extend_inode(self, inode, attrs["size"].uint64_value(), res); + } + }); + return; + } + auto new_size = ext->cur_extend; + ext->cur_extend = ext->next_extend = 0; + complete_extend_inode(self, inode, new_size, res); + }); +} + +static void nfs_do_write(nfs_client_t *self, std::multimap::iterator ewr_it, + rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf) +{ + cluster_op_t *op = new cluster_op_t; + op->opcode = OSD_OP_WRITE; + op->inode = self->parent->fs_base_inode + inode; + op->offset = offset; + op->len = count; + op->iov.push_back(buf, count); + op->callback = [self, ewr_it, rop](cluster_op_t *op) + { + auto inode = op->inode; + int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0); + if (ewr_it == self->extend_writes.end()) + { + complete_extend_write(self, rop, inode, write_res); + } + else + { + ewr_it->second.write_res = write_res; + if (ewr_it->second.resize_res <= 0) + { + complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res); + self->extend_writes.erase(ewr_it); + } + } + }; + self->parent->cli->execute(op); +} + +static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf) +{ + // Check if we have to resize the inode during write + kv_read_inode(self, inode, [=](int res, const std::string & value, json11::Json attrs) + { + if (res < 0) + { + *(WRITE3res*)rop->reply = (WRITE3res){ .status = vitastor_nfs_map_err(-res) }; + rpc_queue_reply(rop); + } + else if (kv_map_type(attrs["type"].string_value()) != NF3REG) + { + *(WRITE3res*)rop->reply = (WRITE3res){ .status = NFS3ERR_INVAL }; + rpc_queue_reply(rop); + } + else if (attrs["size"].uint64_value() < new_size) + { + auto ewr_it = self->extend_writes.emplace((extend_size_t){ + .inode = inode, + .new_size = new_size, + }, (extend_write_t){ + .rop = rop, + .resize_res = 1, + .write_res = 1, + }); + auto ext = &self->extends[inode]; + if (ext->cur_extend > 0) + { + // Already resizing, just wait + if (ext->next_extend < new_size) + ext->next_extend = new_size; + } + else + { + ext->old_ientry = value; + ext->attrs = attrs.object_items(); + ext->cur_extend = new_size; + ext->attrs["size"] = new_size; + extend_inode(self, inode); + } + nfs_do_write(self, ewr_it, rop, inode, offset, count, buf); + } + else + { + nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf); + } + }); +} diff --git a/src/nfs_mount.cpp b/src/nfs_mount.cpp new file mode 100644 index 00000000..3e85434b --- /dev/null +++ b/src/nfs_mount.cpp @@ -0,0 +1,115 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) +// +// NFS proxy - common NULL, ACCESS, COMMIT, DUMP, EXPORT, MNT, UMNT, UMNTALL + +#include + +#include "str_util.h" + +#include "nfs_proxy.h" + +#include "nfs/nfs.h" + +#include "cli.h" + +int nfs3_null_proc(void *opaque, rpc_op_t *rop) +{ + rpc_queue_reply(rop); + return 0; +} + +int nfs3_access_proc(void *opaque, rpc_op_t *rop) +{ + //nfs_client_t *self = (nfs_client_t*)opaque; + ACCESS3args *args = (ACCESS3args*)rop->request; + ACCESS3res *reply = (ACCESS3res*)rop->reply; + *reply = (ACCESS3res){ + .status = NFS3_OK, + .resok = (ACCESS3resok){ + .access = args->access, + }, + }; + rpc_queue_reply(rop); + return 0; +} + +int nfs3_commit_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + //COMMIT3args *args = (COMMIT3args*)rop->request; + cluster_op_t *op = new cluster_op_t; + // fsync. we don't know how to fsync a single inode, so just fsync everything + op->opcode = OSD_OP_SYNC; + op->callback = [self, rop](cluster_op_t *op) + { + COMMIT3res *reply = (COMMIT3res*)rop->reply; + *reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) }; + *(uint64_t*)reply->resok.verf = self->parent->server_id; + rpc_queue_reply(rop); + }; + self->parent->cli->execute(op); + return 1; +} + +int mount3_mnt_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + //nfs_dirpath *args = (nfs_dirpath*)rop->request; + if (self->parent->trace) + fprintf(stderr, "[%d] MNT\n", self->nfs_fd); + nfs_mountres3 *reply = (nfs_mountres3*)rop->reply; + u_int flavor = RPC_AUTH_NONE; + reply->fhs_status = MNT3_OK; + reply->mountinfo.fhandle = xdr_copy_string(rop->xdrs, KV_ROOT_HANDLE); + reply->mountinfo.auth_flavors.auth_flavors_len = 1; + reply->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&flavor, sizeof(u_int)).data; + rpc_queue_reply(rop); + return 0; +} + +int mount3_dump_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + if (self->parent->trace) + fprintf(stderr, "[%d] DUMP\n", self->nfs_fd); + nfs_mountlist *reply = (nfs_mountlist*)rop->reply; + *reply = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody)); + xdr_add_malloc(rop->xdrs, *reply); + (*reply)->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1"); + (*reply)->ml_directory = xdr_copy_string(rop->xdrs, self->parent->export_root); + (*reply)->ml_next = NULL; + rpc_queue_reply(rop); + return 0; +} + +int mount3_umnt_proc(void *opaque, rpc_op_t *rop) +{ + //nfs_client_t *self = (nfs_client_t*)opaque; + //nfs_dirpath *arg = (nfs_dirpath*)rop->request; + // do nothing + rpc_queue_reply(rop); + return 0; +} + +int mount3_umntall_proc(void *opaque, rpc_op_t *rop) +{ + // do nothing + rpc_queue_reply(rop); + return 0; +} + +int mount3_export_proc(void *opaque, rpc_op_t *rop) +{ + nfs_client_t *self = (nfs_client_t*)opaque; + nfs_exports *reply = (nfs_exports*)rop->reply; + *reply = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode)); + xdr_add_malloc(rop->xdrs, *reply); + (*reply)->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root); + (*reply)->ex_groups = (struct nfs_groupnode*)(reply+1); + (*reply)->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1"); + (*reply)->ex_groups->gr_next = NULL; + (*reply)->ex_next = NULL; + rpc_queue_reply(rop); + return 0; +} diff --git a/src/nfs_proxy.h b/src/nfs_proxy.h index ea7b1307..98ca569c 100644 --- a/src/nfs_proxy.h +++ b/src/nfs_proxy.h @@ -7,6 +7,9 @@ #include "kv_db.h" #define RPC_INIT_BUF_SIZE 32768 +#define MAX_REQUEST_SIZE 128*1024*1024 +#define TRUE 1 +#define FALSE 0 class cli_tool_t; @@ -156,3 +159,54 @@ public: bool deref(); void stop(); }; + +// FIXME: Move to "impl" +#include "nfs/nfs.h" + +#define KV_ROOT_INODE 1 +#define KV_NEXT_ID_KEY "id" +#define KV_ROOT_HANDLE "R" + +nfsstat3 vitastor_nfs_map_err(int err); +nfstime3 nfstime_from_str(const std::string & s); +std::string nfstime_to_str(nfstime3 t); +int kv_map_type(const std::string & type); +fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs); +std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename); +std::string kv_direntry_filename(const std::string & key); +std::string kv_inode_key(uint64_t ino); +std::string kv_fh(uint64_t ino); +uint64_t kv_fh_inode(const std::string & fh); +bool kv_fh_valid(const std::string & fh); +void allocate_new_id(nfs_client_t *self, std::function cb); +void kv_read_inode(nfs_client_t *self, uint64_t ino, + std::function cb, + bool allow_cache = false); + +int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_fsstat_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop); +int kv_nfs3_pathconf_proc(void *opaque, rpc_op_t *rop); +int nfs3_access_proc(void *opaque, rpc_op_t *rop); +int nfs3_null_proc(void *opaque, rpc_op_t *rop); +int nfs3_commit_proc(void *opaque, rpc_op_t *rop); +int mount3_mnt_proc(void *opaque, rpc_op_t *rop); +int mount3_dump_proc(void *opaque, rpc_op_t *rop); +int mount3_umnt_proc(void *opaque, rpc_op_t *rop); +int mount3_umntall_proc(void *opaque, rpc_op_t *rop); +int mount3_export_proc(void *opaque, rpc_op_t *rop);