From ecfc753e932efaca1d7b3d964173ebf61d125d54 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 25 Feb 2024 02:27:10 +0300 Subject: [PATCH] Add basic NFS tests, fix bugs --- .gitea/workflows/buildenv.Dockerfile | 2 +- .gitea/workflows/test.yml | 18 +++ src/nfs_kv.cpp | 24 +++- src/nfs_kv.h | 5 +- src/nfs_kv_create.cpp | 7 +- src/nfs_kv_getattr.cpp | 2 + src/nfs_kv_read.cpp | 9 +- src/nfs_kv_readdir.cpp | 12 +- src/nfs_kv_remove.cpp | 2 +- src/nfs_kv_setattr.cpp | 14 +- src/nfs_kv_write.cpp | 190 ++++++++++++++++++--------- src/nfs_proxy.cpp | 17 ++- tests/run_tests.sh | 2 + tests/test_nfs.sh | 149 +++++++++++++++++++++ 14 files changed, 367 insertions(+), 86 deletions(-) create mode 100755 tests/test_nfs.sh diff --git a/.gitea/workflows/buildenv.Dockerfile b/.gitea/workflows/buildenv.Dockerfile index 4e4ae6d1..8f6ca4a1 100644 --- a/.gitea/workflows/buildenv.Dockerfile +++ b/.gitea/workflows/buildenv.Dockerfile @@ -22,7 +22,7 @@ RUN apt-get update RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \ liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` -RUN apt-get -y install jq lp-solve sudo +RUN apt-get -y install jq lp-solve sudo nfs-common RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN set -ex; \ diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index 683c1d58..0635c3bd 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -856,3 +856,21 @@ jobs: echo "" done + test_nfs: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: /root/vitastor/tests/test_nfs.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + diff --git a/src/nfs_kv.cpp b/src/nfs_kv.cpp index 2dc28d04..46166d54 100644 --- a/src/nfs_kv.cpp +++ b/src/nfs_kv.cpp @@ -29,11 +29,29 @@ nfstime3 nfstime_from_str(const std::string & s) return t; } +static std::string timespec_to_str(timespec t) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ju.%09ju", t.tv_sec, t.tv_nsec); + int l = strlen(buf); + while (l > 0 && buf[l-1] == '0') + l--; + if (l > 0 && buf[l-1] == '.') + l--; + buf[l] = 0; + return buf; +} + std::string nfstime_to_str(nfstime3 t) { - char buf[32]; - snprintf(buf, sizeof(buf), "%u.%09u", t.seconds, t.nseconds); - return buf; + return timespec_to_str((timespec){ .tv_sec = t.seconds, .tv_nsec = t.nseconds }); +} + +std::string nfstime_now_str() +{ + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return timespec_to_str(t); } int kv_map_type(const std::string & type) diff --git a/src/nfs_kv.h b/src/nfs_kv.h index 34c92e89..7749c9cb 100644 --- a/src/nfs_kv.h +++ b/src/nfs_kv.h @@ -46,7 +46,7 @@ struct kv_inode_extend_t struct kv_fs_state_t { std::map list_cookies; - uint64_t fs_next_id = 0, fs_allocated_id = 0; + uint64_t fs_next_id = 1, fs_allocated_id = 0; std::vector unallocated_ids; std::vector allocating_shared; uint64_t cur_shared_inode = 0, cur_shared_offset = 0; @@ -57,12 +57,13 @@ struct shared_file_header_t { uint64_t magic = 0; uint64_t inode = 0; - uint64_t size = 0; + uint64_t alloc = 0; }; nfsstat3 vitastor_nfs_map_err(int err); nfstime3 nfstime_from_str(const std::string & s); std::string nfstime_to_str(nfstime3 t); +std::string nfstime_now_str(); int kv_map_type(const std::string & type); fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs); std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename); diff --git a/src/nfs_kv_create.cpp b/src/nfs_kv_create.cpp index 1f6668f3..4019cc70 100644 --- a/src/nfs_kv_create.cpp +++ b/src/nfs_kv_create.cpp @@ -94,6 +94,10 @@ static void kv_do_create(kv_create_state *st) cb(-EINVAL); return; } + if (st->attrobj.find("mtime") == st->attrobj.end()) + st->attrobj["mtime"] = nfstime_now_str(); + if (st->attrobj.find("atime") == st->attrobj.end()) + st->attrobj["atime"] = st->attrobj["mtime"]; // Generate inode ID allocate_new_id(st->self, [st](int res, uint64_t new_id) { @@ -105,7 +109,8 @@ static void kv_do_create(kv_create_state *st) } st->new_id = new_id; auto direntry = json11::Json::object{ { "ino", st->new_id } }; - if (st->attrobj["type"].string_value() == "dir") + if (st->attrobj.find("type") != st->attrobj.end() && + st->attrobj["type"].string_value() == "dir") { direntry["type"] = "dir"; } diff --git a/src/nfs_kv_getattr.cpp b/src/nfs_kv_getattr.cpp index c7c218fa..2ebf22fa 100644 --- a/src/nfs_kv_getattr.cpp +++ b/src/nfs_kv_getattr.cpp @@ -57,6 +57,8 @@ int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop) } kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs) { + if (self->parent->trace) + fprintf(stderr, "[%d] GETATTR %ju -> %s\n", self->nfs_fd, ino, value.c_str()); if (res < 0) { *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) }; diff --git a/src/nfs_kv_read.cpp b/src/nfs_kv_read.cpp index c5e6d2b3..aa36c887 100644 --- a/src/nfs_kv_read.cpp +++ b/src/nfs_kv_read.cpp @@ -85,8 +85,7 @@ resume_2: return; } auto hdr = ((shared_file_header_t*)st->aligned_buf); - if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino || - align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value())) + if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino) { // Got unrelated data - retry from the beginning free(st->aligned_buf); @@ -101,7 +100,7 @@ resume_2: } } st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1)); - st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment) & + st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1)) - st->aligned_offset; st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); st->buf = st->aligned_buf + st->offset - st->aligned_offset; @@ -121,7 +120,7 @@ resume_2: return; resume_3: auto cb = std::move(st->cb); - cb(st->res); + cb(st->res < 0 ? st->res : 0); return; } @@ -157,6 +156,8 @@ int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop) rpc_queue_reply(st->rop); delete st; }; + if (st->self->parent->trace) + fprintf(stderr, "[%d] READ %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size); nfs_kv_continue_read(st, 0); return 1; } diff --git a/src/nfs_kv_readdir.cpp b/src/nfs_kv_readdir.cpp index b7e8c409..80289c82 100644 --- a/src/nfs_kv_readdir.cpp +++ b/src/nfs_kv_readdir.cpp @@ -188,7 +188,7 @@ resume_2: auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie }); if (lc_it != st->self->parent->kvfs->list_cookies.end()) { - st->start = lc_it->second.key; + st->start = st->prefix+lc_it->second.key; st->to_skip = 1; st->offset = st->cookie; } @@ -235,7 +235,7 @@ resume_2: st->self->parent->db->list_next(st->list_handle, NULL); return; resume_3: - if (st->res == -ENOENT || st->cur_key.size() > st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix) + if (st->res == -ENOENT || st->cur_key.size() < st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix) { st->self->parent->db->list_close(st->list_handle); st->list_handle = NULL; @@ -256,6 +256,11 @@ resume_3: } auto ino = direntry["ino"].uint64_value(); auto name = kv_direntry_filename(st->cur_key); + if (st->self->parent->trace) + { + fprintf(stderr, "[%d] READDIR %ju %lu %s\n", + st->self->nfs_fd, st->dir_ino, st->offset, name.c_str()); + } auto fh = kv_fh(ino); // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8 @@ -276,7 +281,7 @@ resume_3: entry->name = xdr_copy_string(st->rop->xdrs, name); entry->fileid = ino; entry->cookie = st->offset++; - st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name }; + st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = name }; if (st->is_plus) { entry->name_handle = (post_op_fh3){ @@ -285,7 +290,6 @@ resume_3: }; kv_getattr_next(st); } - st->self->parent->db->list_next(st->list_handle, NULL); } resume_4: while (st->getattr_running > 0) diff --git a/src/nfs_kv_remove.cpp b/src/nfs_kv_remove.cpp index 98010279..f515e2f4 100644 --- a/src/nfs_kv_remove.cpp +++ b/src/nfs_kv_remove.cpp @@ -225,7 +225,7 @@ resume_6: } // (6) If regular file and inode is deleted: delete data if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 && - !st->ientry["shared_inode"].uint64_value()) + !st->ientry["shared_ino"].uint64_value()) { // Remove data st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { diff --git a/src/nfs_kv_setattr.cpp b/src/nfs_kv_setattr.cpp index 35aa9e71..e8223836 100644 --- a/src/nfs_kv_setattr.cpp +++ b/src/nfs_kv_setattr.cpp @@ -99,7 +99,8 @@ resume_2: return; } if (!st->set_attrs["size"].is_null() && - st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value()) + st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value() && + !st->ientry["shared_ino"].uint64_value()) { // Delete extra data when downsizing st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { @@ -147,11 +148,16 @@ int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop) st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid; if (args->new_attributes.gid.set_it) st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid; - if (args->new_attributes.atime.set_it) + if (args->new_attributes.atime.set_it == SET_TO_SERVER_TIME) + st->set_attrs["atime"] = nfstime_now_str(); + else if (args->new_attributes.atime.set_it == SET_TO_CLIENT_TIME) st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime); - if (args->new_attributes.mtime.set_it) + if (args->new_attributes.mtime.set_it == SET_TO_SERVER_TIME) + st->set_attrs["mtime"] = nfstime_now_str(); + else if (args->new_attributes.mtime.set_it == SET_TO_CLIENT_TIME) st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime); - fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str()); + if (st->self->parent->trace) + fprintf(stderr, "[%d] SETATTR %ju ATTRS %s\n", st->self->nfs_fd, st->ino, json11::Json(st->set_attrs).dump().c_str()); st->cb = [st](int res) { auto reply = (SETATTR3res*)st->rop->reply; diff --git a/src/nfs_kv_write.cpp b/src/nfs_kv_write.cpp index 053de596..59a9be5d 100644 --- a/src/nfs_kv_write.cpp +++ b/src/nfs_kv_write.cpp @@ -17,6 +17,7 @@ struct nfs_rmw_t uint8_t *buf = NULL; uint64_t size = 0; uint8_t *part_buf = NULL; + uint64_t version = 0; }; struct nfs_kv_write_state @@ -106,6 +107,14 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t si ); }); } + else + { + st->res = 0; + st->shared_inode = st->self->parent->kvfs->cur_shared_inode; + st->shared_offset = st->self->parent->kvfs->cur_shared_offset; + st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1); + nfs_kv_continue_write(st, state); + } } uint64_t align_shared_size(nfs_client_t *self, uint64_t size) @@ -154,10 +163,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw) { auto parent = rmw->st->self->parent; auto align = parent->pool_alignment; - bool is_begin = (rmw->offset % align); - bool is_end = ((rmw->offset+rmw->size) % align); - // RMW either only at beginning or only at end and within a single block - assert(is_begin != is_end); + assert(rmw->size < align); assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size)); if (!rmw->part_buf) { @@ -166,7 +172,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw) auto op = new cluster_op_t; op->opcode = OSD_OP_READ; op->inode = parent->fs_base_inode + rmw->ino; - op->offset = (rmw->offset + (is_begin ? 0 : rmw->size)) & ~(align-1); + op->offset = rmw->offset & ~(align-1); op->len = align; op->iov.push_back(rmw->part_buf, op->len); rmw->st->waiting++; @@ -185,30 +191,43 @@ static void nfs_do_rmw(nfs_rmw_t *rmw) } else { + if (!rmw->version) + { + auto st = rmw->st; + rmw->version = rd_op->version+1; + if (st->rmw[0].st && st->rmw[1].st && + st->rmw[0].offset/st->self->parent->pool_block_size == st->rmw[1].offset/st->self->parent->pool_block_size) + { + // Same block... RMWs should be sequential + int other = rmw == &st->rmw[0] ? 1 : 0; + st->rmw[other].version = rmw->version+1; + } + } auto parent = rmw->st->self->parent; auto align = parent->pool_alignment; bool is_begin = (rmw->offset % align); + bool is_end = ((rmw->offset+rmw->size) % align); auto op = new cluster_op_t; op->opcode = OSD_OP_WRITE; op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino; op->offset = rmw->offset & ~(align-1); - op->len = (rmw->size + align-1) & ~(align-1); - op->version = rd_op->version+1; + op->len = align; + op->version = rmw->version; if (is_begin) { op->iov.push_back(rmw->part_buf, rmw->offset % align); } op->iov.push_back(rmw->buf, rmw->size); - if (!is_begin) + if (is_end) { - auto tail = ((rmw->offset+rmw->size) % align); - op->iov.push_back(rmw->part_buf + tail, align - tail); + op->iov.push_back(rmw->part_buf + (rmw->offset % align) + rmw->size, align - (rmw->offset % align) - rmw->size); } op->callback = [rmw](cluster_op_t *op) { - if (op->retval == -EAGAIN) + if (op->retval == -EINTR) { // CAS failure - retry + rmw->version = 0; rmw->st->waiting--; nfs_do_rmw(rmw); } @@ -272,11 +291,12 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int else if (state == base_state) goto resume_0; assert(!st->aligned_buf); st->aligned_size = unshare - ? sizeof(shared_file_header_t) + (st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1) + ? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1)) : align_shared_size(st->self, st->new_size); st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset); - if (st->ientry["shared_ino"].uint64_value() != 0) + if (st->ientry["shared_ino"].uint64_value() != 0 && + st->ientry["size"].uint64_value() != 0) { // Read old data if shared non-empty nfs_do_shared_read(st, base_state); @@ -286,14 +306,15 @@ resume_0: { auto cb = std::move(st->cb); cb(st->res); - return true; + return false; } auto hdr = ((shared_file_header_t*)st->aligned_buf); - if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino || - align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value())) + if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino) { // Got unrelated data - retry from the beginning st->allow_cache = false; + free(st->aligned_buf); + st->aligned_buf = NULL; nfs_kv_continue_write(st, 0); return false; } @@ -301,7 +322,7 @@ resume_0: *((shared_file_header_t*)st->aligned_buf) = { .magic = SHARED_FILE_MAGIC_V1, .inode = st->ino, - .size = st->new_size, + .alloc = st->aligned_size, }; memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size); memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0, @@ -316,6 +337,8 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of uint64_t good_offset = offset; uint64_t good_size = st->size; st->waiting++; + st->rmw[0].st = NULL; + st->rmw[1].st = NULL; if (offset % alignment) { // Requires read-modify-write in the beginning @@ -337,17 +360,18 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of .buf = st->buf, .size = s, }; + // FIXME: skip rmw at shared beginning nfs_do_rmw(&st->rmw[0]); } - if ((offset+st->size-1) % alignment) + if ((offset+st->size) % alignment) { // Requires read-modify-write in the end - auto s = ((offset+st->size-1) % alignment); + auto s = ((offset+st->size) % alignment); if (good_size > s) good_size -= s; else good_size = 0; - if (((offset+st->size-1) / alignment) > (offset / alignment)) + if ((offset+st->size)/alignment > offset/alignment) { st->rmw[1] = { .st = st, @@ -357,6 +381,7 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of .buf = st->buf + st->size-s, .size = s, }; + // FIXME: skip rmw at end nfs_do_rmw(&st->rmw[1]); } } @@ -405,19 +430,28 @@ static std::string new_shared_ientry(nfs_kv_write_state *st) return json11::Json(ni).dump(); } -static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state) +static std::string new_unshared_ientry(nfs_kv_write_state *st) { - if (state == 1) - { + auto ni = st->ientry.object_items(); + ni.erase("empty"); + ni.erase("shared_ino"); + ni.erase("shared_offset"); + ni.erase("shared_alloc"); + ni.erase("shared_ver"); + return json11::Json(ni).dump(); +} + +static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state, int base_state) +{ + if (state == base_state+1) goto resume_1; - } st->ext->cur_extend = st->ext->next_extend; st->ext->next_extend = 0; st->res2 = -EAGAIN; - st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st](int res) + st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st, base_state](int res) { st->res = res; - nfs_kv_continue_write(st, 13); + nfs_kv_continue_write(st, base_state+1); }, [st](int res, const std::string & old_value) { if (res != 0) @@ -432,6 +466,7 @@ static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state) auto ientry = json11::Json::parse(old_value, err).object_items(); if (err != "") { + fprintf(stderr, "Invalid JSON in inode %lu = %s: %s\n", st->ino, old_value.c_str(), err.c_str()); st->res2 = -EINVAL; return false; } @@ -501,24 +536,20 @@ resume_1: // - In parallel: check if data fits into inode size and extend if it doesn't // - If CAS failure: re-read inode and retry to extend the size // - If shared: -// - Read whole file from shared inode -// - If the file header in data doesn't match: re-read inode and restart // - If data doesn't fit into the same shared inode: // - Allocate space in a new shared inode +// - Read whole file from shared inode // - Write data into the new shared inode // - If CAS failure: allocate another shared inode and retry // - Update inode metadata (set new size and new shared inode) // - If CAS failure: free allocated shared space, re-read inode and restart // - If it fits: -// - Write updated data into the shared inode +// - Update shared inode data in-place // - Update inode entry in any case to block parallel non-shared writes // - If CAS failure: re-read inode and restart // - Otherwise: -// - Write data into non-shared inode -// - Read inode in parallel -// - If not a regular file: -// - Remove data -// - Stop with -EINVAL +// - Read inode +// - If not a regular file - stop with -EINVAL // - If shared: // - Read whole file from shared inode // - Write data into non-shared inode @@ -526,9 +557,9 @@ resume_1: // - Update inode metadata (make non-shared, update size) // - If CAS failure: restart // - Zero out the shared inode header -// - If CAS failure: restart -// - Check if size fits -// - Extend if it doesn't +// - Write data into non-shared inode +// - Check if size fits +// - Extend if it doesn't // Read: // - If (offset+size <= threshold): // - Read inode from cache @@ -557,6 +588,9 @@ static void nfs_kv_continue_write(nfs_kv_write_state *st, int state) else if (state == 11) goto resume_11; else if (state == 12) goto resume_12; else if (state == 13) goto resume_13; + else if (state == 14) goto resume_14; + else if (state == 15) goto resume_15; + else if (state == 16) goto resume_16; else { fprintf(stderr, "BUG: invalid state in nfs_kv_continue_write()"); @@ -578,9 +612,7 @@ resume_0: }, st->allow_cache); return; resume_1: - if (st->res < 0 || - st->ientry["type"].uint64_value() != 0 && - st->ientry["type"].uint64_value() != NF3REG) + if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG) { auto cb = std::move(st->cb); cb(st->res == 0 ? -EINVAL : st->res); @@ -594,11 +626,11 @@ resume_1: } if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold) { - if (st->ientry["size"].uint64_value() == 0 || + if (st->ientry["size"].uint64_value() == 0 && + st->ientry["shared_ino"].uint64_value() == 0 || st->ientry["empty"].bool_value() && - st->ientry["size"].uint64_value() + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold || + (st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold || st->ientry["shared_ino"].uint64_value() != 0 && - st->ientry["size"].uint64_value() < st->offset+st->size && st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size)) { // Either empty, or shared and requires moving into a larger place (redirect-write) @@ -669,17 +701,18 @@ resume_7: nfs_do_fsync(st, 8); return; } +resume_8: // We always have to change inode entry on shared writes st->self->parent->db->set(kv_inode_key(st->ino), new_shared_ientry(st), [st](int res) { st->res = res; - nfs_kv_continue_write(st, 8); + nfs_kv_continue_write(st, 9); }, [st](int res, const std::string & old_value) { return res == 0 && old_value == st->ientry_text; }); return; -resume_8: +resume_9: if (st->res == -EAGAIN) { goto resume_0; @@ -690,28 +723,55 @@ resume_8: } // Fall through for non-shared } - // Non-shared write + // Unshare? if (st->ientry["shared_ino"].uint64_value() != 0) { - // Unshare -resume_9: - if (!nfs_do_shared_readmodify(st, 9, state, true)) + if (st->ientry["size"].uint64_value() != 0) + { + assert(!st->aligned_buf); + st->aligned_size = align_shared_size(st->self, st->ientry["size"].uint64_value()); + st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); + nfs_do_shared_read(st, 10); return; - nfs_do_unshare_write(st, 10); - return; - } - else - { - // Just write - nfs_do_align_write(st, st->ino, st->offset, 10); - } resume_10: + nfs_do_unshare_write(st, 11); + return; +resume_11: + ; + } + st->self->parent->db->set(kv_inode_key(st->ino), new_unshared_ientry(st), [st](int res) + { + st->res = res; + nfs_kv_continue_write(st, 12); + }, [st](int res, const std::string & old_value) + { + return res == 0 && old_value == st->ientry_text; + }); + return; +resume_12: + if (st->res == -EAGAIN) + { + // Restart + goto resume_0; + } + if (st->res < 0) + { + auto cb = std::move(st->cb); + cb(st->res); + return; + } + st->ientry_text = new_unshared_ientry(st); + } + // Non-shared write + nfs_do_align_write(st, st->ino, st->offset, 13); + return; +resume_13: if (st->res == 0 && st->stable && !st->was_immediate) { - nfs_do_fsync(st, 11); + nfs_do_fsync(st, 14); return; } -resume_11: +resume_14: if (st->res < 0) { auto cb = std::move(st->cb); @@ -724,7 +784,7 @@ resume_11: { st->ext = &st->self->parent->kvfs->extends[st->ino]; st->ext->refcnt++; -resume_12: +resume_15: if (st->ext->next_extend < st->new_size) { // Aggregate inode extension requests @@ -733,15 +793,15 @@ resume_12: if (st->ext->cur_extend > 0) { // Wait for current extend which is already in progress - st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 12); }); + st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 15); }); return; } if (st->ext->done_extend < st->new_size) { - nfs_kv_extend_inode(st, 0); + nfs_kv_extend_inode(st, 15, 15); return; -resume_13: - nfs_kv_extend_inode(st, 1); +resume_16: + nfs_kv_extend_inode(st, 16, 15); } st->ext->refcnt--; assert(st->ext->refcnt >= 0); @@ -769,6 +829,8 @@ int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop) st->ino = kv_fh_inode(args->file); st->offset = args->offset; st->size = (args->count > args->data.size ? args->data.size : args->count); + if (st->self->parent->trace) + fprintf(stderr, "[%d] WRITE %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size); if (!st->ino || st->size > MAX_REQUEST_SIZE) { *reply = (WRITE3res){ .status = NFS3ERR_INVAL }; diff --git a/src/nfs_proxy.cpp b/src/nfs_proxy.cpp index 11aa055a..3428a470 100644 --- a/src/nfs_proxy.cpp +++ b/src/nfs_proxy.cpp @@ -98,7 +98,7 @@ void nfs_proxy_t::run(json11::Json cfg) srand48(tv.tv_sec*1000000000 + tv.tv_nsec); server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62); // Parse options - trace = cfg["log_level"].uint64_value() > 5; + trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0; bind_address = cfg["bind"].string_value(); if (bind_address == "") bind_address = "0.0.0.0"; @@ -156,7 +156,15 @@ void nfs_proxy_t::run(json11::Json cfg) check_default_pool(); // Check if we're using VitastorFS fs_kv_inode = cfg["fs"].uint64_value(); - if (!fs_kv_inode && cfg["fs"].is_string()) + if (fs_kv_inode) + { + if (!INODE_POOL(fs_kv_inode)) + { + fprintf(stderr, "FS metadata inode number must include pool\n"); + exit(1); + } + } + else if (cfg["fs"].is_string()) { for (auto & ic: cli->st_cli.inode_config) { @@ -166,6 +174,11 @@ void nfs_proxy_t::run(json11::Json cfg) break; } } + if (!fs_kv_inode) + { + fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str()); + exit(1); + } } readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value(); if (!readdir_getattr_parallel) diff --git a/tests/run_tests.sh b/tests/run_tests.sh index fcf079ec..5212792e 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -68,3 +68,5 @@ SCHEME=xor ./test_scrub.sh PG_SIZE=3 ./test_scrub.sh PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh SCHEME=ec ./test_scrub.sh + +./test_nfs.sh diff --git a/tests/test_nfs.sh b/tests/test_nfs.sh new file mode 100755 index 00000000..9d2521fc --- /dev/null +++ b/tests/test_nfs.sh @@ -0,0 +1,149 @@ +#!/bin/bash -ex + +PG_COUNT=16 +. `dirname $0`/run_3osds.sh + +build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta +build/src/vitastor-nfs --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 & +NFS_PID=$! + +mkdir -p testdata/nfs +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +MNT=$(pwd)/testdata/nfs +trap "sudo umount -f $MNT"' || true; kill -9 $(jobs -p)' EXIT + +# write small file +ls -l ./testdata/nfs +dd if=/dev/urandom of=./testdata/f1 bs=100k count=1 +cp testdata/f1 ./testdata/nfs/ +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep f1 +diff ./testdata/f1 ./testdata/nfs/f1 +format_green "100K file ok" + +# overwrite it inplace +dd if=/dev/urandom of=./testdata/f1_90k bs=90k count=1 +cp testdata/f1_90k ./testdata/nfs/f1 +sudo umount ./testdata/nfs/ +format_green "inplace overwrite 90K ok" +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep f1 +# create another copy +dd if=./testdata/f1_90k of=./testdata/nfs/f1_nfs bs=1M +diff ./testdata/f1_90k ./testdata/nfs/f1_nfs +sudo umount ./testdata/nfs/ +format_green "another copy 90K ok" +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep f1 +cp ./testdata/nfs/f1 ./testdata/f1_nfs +diff ./testdata/f1_90k ./testdata/nfs/f1 +format_green "90K data ok" + +# move it inplace +dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1 +cp testdata/f1_110k ./testdata/nfs/f1 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep f1 +diff ./testdata/f1_110k ./testdata/nfs/f1 +format_green "move shared 90K -> 110K ok" + +# extend it to large file + rm +dd if=/dev/urandom of=./testdata/f1_2M bs=2M count=1 +cp ./testdata/f1_2M ./testdata/nfs/f1 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep f1 +cp ./testdata/nfs/f1 ./testdata/f1_nfs +diff ./testdata/f1_2M ./testdata/nfs/f1 +rm ./testdata/nfs/f1 +format_green "extend to 2M + rm ok" + +# mkdir +mkdir -p ./testdata/nfs/dir1/dir2 +echo abcdef > ./testdata/nfs/dir1/dir2/hnpfls +# rename dir +mv ./testdata/nfs/dir1 ./testdata/nfs/dir3 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep dir3 +ls -l ./testdata/nfs/dir3 | grep dir2 +ls -l ./testdata/nfs/dir3/dir2 | grep hnpfls +echo abcdef > ./testdata/hnpfls +diff ./testdata/hnpfls ./testdata/nfs/dir3/dir2/hnpfls +format_green "rename dir with file ok" + +# touch +touch -t 202401011404 ./testdata/nfs/dir3/dir2/hnpfls +sudo chown 65534:65534 ./testdata/nfs/dir3/dir2/hnpfls +sudo chmod 755 ./testdata/nfs/dir3/dir2/hnpfls +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +T=`stat -c '%a %u %g %y' ./testdata/nfs/dir3/dir2/hnpfls | perl -pe 's/(:\d+)(.*)/$1/'` +[[ "$T" = "755 65534 65534 2024-01-01 14:04" ]] +format_green "set attrs ok" + +# move dir +mv ./testdata/nfs/dir3/dir2 ./testdata/nfs/ +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +ls -l ./testdata/nfs | grep dir3 +ls -l ./testdata/nfs | grep dir2 +format_green "move dir ok" + +# symlink, readlink +ln -s dir2 ./testdata/nfs/sym2 +[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]] +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]] +[[ "`readlink ./testdata/nfs/sym2`" = "dir2" ]] +format_green "symlink, readlink ok" + +# mknod: chr, blk, sock, fifo + remove +sudo mknod ./testdata/nfs/nod_chr c 1 5 +sudo mknod ./testdata/nfs/nod_blk b 2 6 +mkfifo ./testdata/nfs/nod_fifo +perl -e 'use Socket; socket($sock, PF_UNIX, SOCK_STREAM, undef) || die $!; bind($sock, sockaddr_un("./testdata/nfs/nod_sock")) || die $!;' +chmod 777 ./testdata/nfs/nod_* +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +[[ "`ls testdata|wc -l`" -ge 4 ]] +[[ "`stat -c '%A' ./testdata/nfs/nod_blk`" = "brwxrwxrwx" ]] +[[ "`stat -c '%A' ./testdata/nfs/nod_chr`" = "crwxrwxrwx" ]] +[[ "`stat -c '%A' ./testdata/nfs/nod_fifo`" = "prwxrwxrwx" ]] +[[ "`stat -c '%A' ./testdata/nfs/nod_sock`" = "srwxrwxrwx" ]] +sudo rm ./testdata/nfs/nod_* +format_green "mknod + rm ok" + +# hardlink +echo ABCDEF > ./testdata/nfs/linked1 +i=`stat -c '%i' ./testdata/nfs/linked1` +ln ./testdata/nfs/linked1 ./testdata/nfs/linked2 +[[ "`stat -c '%i' ./testdata/nfs/linked2`" -eq $i ]] +echo BABABA > ./testdata/nfs/linked2 +diff ./testdata/nfs/linked2 ./testdata/nfs/linked1 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +diff ./testdata/nfs/linked2 ./testdata/nfs/linked1 +[[ "`cat ./testdata/nfs/linked2`" = "BABABA" ]] +rm ./testdata/nfs/linked2 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +[[ "`cat ./testdata/nfs/linked1`" = "BABABA" ]] +format_green "hardlink ok" + +# rm small +ls -l ./testdata/nfs +dd if=/dev/urandom of=./testdata/nfs/smallfile bs=100k count=1 +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +rm ./testdata/nfs/smallfile +if ls ./testdata/nfs | grep smallfile; then false; fi +sudo umount ./testdata/nfs/ +sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp +if ls ./testdata/nfs | grep smallfile; then false; fi +format_green "rm small ok" + +format_green OK