Add basic NFS tests, fix bugs

master
Vitaliy Filippov 2024-02-25 02:27:10 +03:00
parent a574f9ad71
commit ecfc753e93
14 changed files with 367 additions and 86 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo
RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \

View File

@ -856,3 +856,21 @@ jobs:
echo ""
done
test_nfs:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_nfs.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

View File

@ -29,11 +29,29 @@ nfstime3 nfstime_from_str(const std::string & s)
return t;
}
static std::string timespec_to_str(timespec t)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ju.%09ju", t.tv_sec, t.tv_nsec);
int l = strlen(buf);
while (l > 0 && buf[l-1] == '0')
l--;
if (l > 0 && buf[l-1] == '.')
l--;
buf[l] = 0;
return buf;
}
std::string nfstime_to_str(nfstime3 t)
{
char buf[32];
snprintf(buf, sizeof(buf), "%u.%09u", t.seconds, t.nseconds);
return buf;
return timespec_to_str((timespec){ .tv_sec = t.seconds, .tv_nsec = t.nseconds });
}
std::string nfstime_now_str()
{
timespec t;
clock_gettime(CLOCK_REALTIME, &t);
return timespec_to_str(t);
}
int kv_map_type(const std::string & type)

View File

@ -46,7 +46,7 @@ struct kv_inode_extend_t
struct kv_fs_state_t
{
std::map<list_cookie_t, list_cookie_val_t> list_cookies;
uint64_t fs_next_id = 0, fs_allocated_id = 0;
uint64_t fs_next_id = 1, fs_allocated_id = 0;
std::vector<uint64_t> unallocated_ids;
std::vector<shared_alloc_queue_t> allocating_shared;
uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
@ -57,12 +57,13 @@ struct shared_file_header_t
{
uint64_t magic = 0;
uint64_t inode = 0;
uint64_t size = 0;
uint64_t alloc = 0;
};
nfsstat3 vitastor_nfs_map_err(int err);
nfstime3 nfstime_from_str(const std::string & s);
std::string nfstime_to_str(nfstime3 t);
std::string nfstime_now_str();
int kv_map_type(const std::string & type);
fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs);
std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename);

View File

@ -94,6 +94,10 @@ static void kv_do_create(kv_create_state *st)
cb(-EINVAL);
return;
}
if (st->attrobj.find("mtime") == st->attrobj.end())
st->attrobj["mtime"] = nfstime_now_str();
if (st->attrobj.find("atime") == st->attrobj.end())
st->attrobj["atime"] = st->attrobj["mtime"];
// Generate inode ID
allocate_new_id(st->self, [st](int res, uint64_t new_id)
{
@ -105,7 +109,8 @@ static void kv_do_create(kv_create_state *st)
}
st->new_id = new_id;
auto direntry = json11::Json::object{ { "ino", st->new_id } };
if (st->attrobj["type"].string_value() == "dir")
if (st->attrobj.find("type") != st->attrobj.end() &&
st->attrobj["type"].string_value() == "dir")
{
direntry["type"] = "dir";
}

View File

@ -57,6 +57,8 @@ int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
}
kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs)
{
if (self->parent->trace)
fprintf(stderr, "[%d] GETATTR %ju -> %s\n", self->nfs_fd, ino, value.c_str());
if (res < 0)
{
*reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) };

View File

@ -85,8 +85,7 @@ resume_2:
return;
}
auto hdr = ((shared_file_header_t*)st->aligned_buf);
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino ||
align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino)
{
// Got unrelated data - retry from the beginning
free(st->aligned_buf);
@ -101,7 +100,7 @@ resume_2:
}
}
st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1));
st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment) &
st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment-1) &
~(st->self->parent->pool_alignment-1)) - st->aligned_offset;
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
st->buf = st->aligned_buf + st->offset - st->aligned_offset;
@ -121,7 +120,7 @@ resume_2:
return;
resume_3:
auto cb = std::move(st->cb);
cb(st->res);
cb(st->res < 0 ? st->res : 0);
return;
}
@ -157,6 +156,8 @@ int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop)
rpc_queue_reply(st->rop);
delete st;
};
if (st->self->parent->trace)
fprintf(stderr, "[%d] READ %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size);
nfs_kv_continue_read(st, 0);
return 1;
}

View File

@ -188,7 +188,7 @@ resume_2:
auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie });
if (lc_it != st->self->parent->kvfs->list_cookies.end())
{
st->start = lc_it->second.key;
st->start = st->prefix+lc_it->second.key;
st->to_skip = 1;
st->offset = st->cookie;
}
@ -235,7 +235,7 @@ resume_2:
st->self->parent->db->list_next(st->list_handle, NULL);
return;
resume_3:
if (st->res == -ENOENT || st->cur_key.size() > st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix)
if (st->res == -ENOENT || st->cur_key.size() < st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix)
{
st->self->parent->db->list_close(st->list_handle);
st->list_handle = NULL;
@ -256,6 +256,11 @@ resume_3:
}
auto ino = direntry["ino"].uint64_value();
auto name = kv_direntry_filename(st->cur_key);
if (st->self->parent->trace)
{
fprintf(stderr, "[%d] READDIR %ju %lu %s\n",
st->self->nfs_fd, st->dir_ino, st->offset, name.c_str());
}
auto fh = kv_fh(ino);
// 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
// 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
@ -276,7 +281,7 @@ resume_3:
entry->name = xdr_copy_string(st->rop->xdrs, name);
entry->fileid = ino;
entry->cookie = st->offset++;
st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name };
st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = name };
if (st->is_plus)
{
entry->name_handle = (post_op_fh3){
@ -285,7 +290,6 @@ resume_3:
};
kv_getattr_next(st);
}
st->self->parent->db->list_next(st->list_handle, NULL);
}
resume_4:
while (st->getattr_running > 0)

View File

@ -225,7 +225,7 @@ resume_6:
}
// (6) If regular file and inode is deleted: delete data
if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 &&
!st->ientry["shared_inode"].uint64_value())
!st->ientry["shared_ino"].uint64_value())
{
// Remove data
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {

View File

@ -99,7 +99,8 @@ resume_2:
return;
}
if (!st->set_attrs["size"].is_null() &&
st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value())
st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value() &&
!st->ientry["shared_ino"].uint64_value())
{
// Delete extra data when downsizing
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
@ -147,11 +148,16 @@ int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid;
if (args->new_attributes.gid.set_it)
st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid;
if (args->new_attributes.atime.set_it)
if (args->new_attributes.atime.set_it == SET_TO_SERVER_TIME)
st->set_attrs["atime"] = nfstime_now_str();
else if (args->new_attributes.atime.set_it == SET_TO_CLIENT_TIME)
st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
if (args->new_attributes.mtime.set_it)
if (args->new_attributes.mtime.set_it == SET_TO_SERVER_TIME)
st->set_attrs["mtime"] = nfstime_now_str();
else if (args->new_attributes.mtime.set_it == SET_TO_CLIENT_TIME)
st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str());
if (st->self->parent->trace)
fprintf(stderr, "[%d] SETATTR %ju ATTRS %s\n", st->self->nfs_fd, st->ino, json11::Json(st->set_attrs).dump().c_str());
st->cb = [st](int res)
{
auto reply = (SETATTR3res*)st->rop->reply;

View File

@ -17,6 +17,7 @@ struct nfs_rmw_t
uint8_t *buf = NULL;
uint64_t size = 0;
uint8_t *part_buf = NULL;
uint64_t version = 0;
};
struct nfs_kv_write_state
@ -106,6 +107,14 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t si
);
});
}
else
{
st->res = 0;
st->shared_inode = st->self->parent->kvfs->cur_shared_inode;
st->shared_offset = st->self->parent->kvfs->cur_shared_offset;
st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1);
nfs_kv_continue_write(st, state);
}
}
uint64_t align_shared_size(nfs_client_t *self, uint64_t size)
@ -154,10 +163,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
{
auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment;
bool is_begin = (rmw->offset % align);
bool is_end = ((rmw->offset+rmw->size) % align);
// RMW either only at beginning or only at end and within a single block
assert(is_begin != is_end);
assert(rmw->size < align);
assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size));
if (!rmw->part_buf)
{
@ -166,7 +172,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
auto op = new cluster_op_t;
op->opcode = OSD_OP_READ;
op->inode = parent->fs_base_inode + rmw->ino;
op->offset = (rmw->offset + (is_begin ? 0 : rmw->size)) & ~(align-1);
op->offset = rmw->offset & ~(align-1);
op->len = align;
op->iov.push_back(rmw->part_buf, op->len);
rmw->st->waiting++;
@ -185,30 +191,43 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
}
else
{
if (!rmw->version)
{
auto st = rmw->st;
rmw->version = rd_op->version+1;
if (st->rmw[0].st && st->rmw[1].st &&
st->rmw[0].offset/st->self->parent->pool_block_size == st->rmw[1].offset/st->self->parent->pool_block_size)
{
// Same block... RMWs should be sequential
int other = rmw == &st->rmw[0] ? 1 : 0;
st->rmw[other].version = rmw->version+1;
}
}
auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment;
bool is_begin = (rmw->offset % align);
bool is_end = ((rmw->offset+rmw->size) % align);
auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE;
op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino;
op->offset = rmw->offset & ~(align-1);
op->len = (rmw->size + align-1) & ~(align-1);
op->version = rd_op->version+1;
op->len = align;
op->version = rmw->version;
if (is_begin)
{
op->iov.push_back(rmw->part_buf, rmw->offset % align);
}
op->iov.push_back(rmw->buf, rmw->size);
if (!is_begin)
if (is_end)
{
auto tail = ((rmw->offset+rmw->size) % align);
op->iov.push_back(rmw->part_buf + tail, align - tail);
op->iov.push_back(rmw->part_buf + (rmw->offset % align) + rmw->size, align - (rmw->offset % align) - rmw->size);
}
op->callback = [rmw](cluster_op_t *op)
{
if (op->retval == -EAGAIN)
if (op->retval == -EINTR)
{
// CAS failure - retry
rmw->version = 0;
rmw->st->waiting--;
nfs_do_rmw(rmw);
}
@ -272,11 +291,12 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int
else if (state == base_state) goto resume_0;
assert(!st->aligned_buf);
st->aligned_size = unshare
? sizeof(shared_file_header_t) + (st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1)
? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1))
: align_shared_size(st->self, st->new_size);
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset);
if (st->ientry["shared_ino"].uint64_value() != 0)
if (st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["size"].uint64_value() != 0)
{
// Read old data if shared non-empty
nfs_do_shared_read(st, base_state);
@ -286,14 +306,15 @@ resume_0:
{
auto cb = std::move(st->cb);
cb(st->res);
return true;
return false;
}
auto hdr = ((shared_file_header_t*)st->aligned_buf);
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino ||
align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino)
{
// Got unrelated data - retry from the beginning
st->allow_cache = false;
free(st->aligned_buf);
st->aligned_buf = NULL;
nfs_kv_continue_write(st, 0);
return false;
}
@ -301,7 +322,7 @@ resume_0:
*((shared_file_header_t*)st->aligned_buf) = {
.magic = SHARED_FILE_MAGIC_V1,
.inode = st->ino,
.size = st->new_size,
.alloc = st->aligned_size,
};
memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size);
memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0,
@ -316,6 +337,8 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
uint64_t good_offset = offset;
uint64_t good_size = st->size;
st->waiting++;
st->rmw[0].st = NULL;
st->rmw[1].st = NULL;
if (offset % alignment)
{
// Requires read-modify-write in the beginning
@ -337,17 +360,18 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
.buf = st->buf,
.size = s,
};
// FIXME: skip rmw at shared beginning
nfs_do_rmw(&st->rmw[0]);
}
if ((offset+st->size-1) % alignment)
if ((offset+st->size) % alignment)
{
// Requires read-modify-write in the end
auto s = ((offset+st->size-1) % alignment);
auto s = ((offset+st->size) % alignment);
if (good_size > s)
good_size -= s;
else
good_size = 0;
if (((offset+st->size-1) / alignment) > (offset / alignment))
if ((offset+st->size)/alignment > offset/alignment)
{
st->rmw[1] = {
.st = st,
@ -357,6 +381,7 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
.buf = st->buf + st->size-s,
.size = s,
};
// FIXME: skip rmw at end
nfs_do_rmw(&st->rmw[1]);
}
}
@ -405,19 +430,28 @@ static std::string new_shared_ientry(nfs_kv_write_state *st)
return json11::Json(ni).dump();
}
static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state)
static std::string new_unshared_ientry(nfs_kv_write_state *st)
{
if (state == 1)
{
auto ni = st->ientry.object_items();
ni.erase("empty");
ni.erase("shared_ino");
ni.erase("shared_offset");
ni.erase("shared_alloc");
ni.erase("shared_ver");
return json11::Json(ni).dump();
}
static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state, int base_state)
{
if (state == base_state+1)
goto resume_1;
}
st->ext->cur_extend = st->ext->next_extend;
st->ext->next_extend = 0;
st->res2 = -EAGAIN;
st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st](int res)
st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st, base_state](int res)
{
st->res = res;
nfs_kv_continue_write(st, 13);
nfs_kv_continue_write(st, base_state+1);
}, [st](int res, const std::string & old_value)
{
if (res != 0)
@ -432,6 +466,7 @@ static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state)
auto ientry = json11::Json::parse(old_value, err).object_items();
if (err != "")
{
fprintf(stderr, "Invalid JSON in inode %lu = %s: %s\n", st->ino, old_value.c_str(), err.c_str());
st->res2 = -EINVAL;
return false;
}
@ -501,24 +536,20 @@ resume_1:
// - In parallel: check if data fits into inode size and extend if it doesn't
// - If CAS failure: re-read inode and retry to extend the size
// - If shared:
// - Read whole file from shared inode
// - If the file header in data doesn't match: re-read inode and restart
// - If data doesn't fit into the same shared inode:
// - Allocate space in a new shared inode
// - Read whole file from shared inode
// - Write data into the new shared inode
// - If CAS failure: allocate another shared inode and retry
// - Update inode metadata (set new size and new shared inode)
// - If CAS failure: free allocated shared space, re-read inode and restart
// - If it fits:
// - Write updated data into the shared inode
// - Update shared inode data in-place
// - Update inode entry in any case to block parallel non-shared writes
// - If CAS failure: re-read inode and restart
// - Otherwise:
// - Write data into non-shared inode
// - Read inode in parallel
// - If not a regular file:
// - Remove data
// - Stop with -EINVAL
// - Read inode
// - If not a regular file - stop with -EINVAL
// - If shared:
// - Read whole file from shared inode
// - Write data into non-shared inode
@ -526,9 +557,9 @@ resume_1:
// - Update inode metadata (make non-shared, update size)
// - If CAS failure: restart
// - Zero out the shared inode header
// - If CAS failure: restart
// - Check if size fits
// - Extend if it doesn't
// - Write data into non-shared inode
// - Check if size fits
// - Extend if it doesn't
// Read:
// - If (offset+size <= threshold):
// - Read inode from cache
@ -557,6 +588,9 @@ static void nfs_kv_continue_write(nfs_kv_write_state *st, int state)
else if (state == 11) goto resume_11;
else if (state == 12) goto resume_12;
else if (state == 13) goto resume_13;
else if (state == 14) goto resume_14;
else if (state == 15) goto resume_15;
else if (state == 16) goto resume_16;
else
{
fprintf(stderr, "BUG: invalid state in nfs_kv_continue_write()");
@ -578,9 +612,7 @@ resume_0:
}, st->allow_cache);
return;
resume_1:
if (st->res < 0 ||
st->ientry["type"].uint64_value() != 0 &&
st->ientry["type"].uint64_value() != NF3REG)
if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG)
{
auto cb = std::move(st->cb);
cb(st->res == 0 ? -EINVAL : st->res);
@ -594,11 +626,11 @@ resume_1:
}
if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
{
if (st->ientry["size"].uint64_value() == 0 ||
if (st->ientry["size"].uint64_value() == 0 &&
st->ientry["shared_ino"].uint64_value() == 0 ||
st->ientry["empty"].bool_value() &&
st->ientry["size"].uint64_value() + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold ||
(st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold ||
st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["size"].uint64_value() < st->offset+st->size &&
st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size))
{
// Either empty, or shared and requires moving into a larger place (redirect-write)
@ -669,17 +701,18 @@ resume_7:
nfs_do_fsync(st, 8);
return;
}
resume_8:
// We always have to change inode entry on shared writes
st->self->parent->db->set(kv_inode_key(st->ino), new_shared_ientry(st), [st](int res)
{
st->res = res;
nfs_kv_continue_write(st, 8);
nfs_kv_continue_write(st, 9);
}, [st](int res, const std::string & old_value)
{
return res == 0 && old_value == st->ientry_text;
});
return;
resume_8:
resume_9:
if (st->res == -EAGAIN)
{
goto resume_0;
@ -690,28 +723,55 @@ resume_8:
}
// Fall through for non-shared
}
// Non-shared write
// Unshare?
if (st->ientry["shared_ino"].uint64_value() != 0)
{
// Unshare
resume_9:
if (!nfs_do_shared_readmodify(st, 9, state, true))
if (st->ientry["size"].uint64_value() != 0)
{
assert(!st->aligned_buf);
st->aligned_size = align_shared_size(st->self, st->ientry["size"].uint64_value());
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
nfs_do_shared_read(st, 10);
return;
nfs_do_unshare_write(st, 10);
return;
}
else
{
// Just write
nfs_do_align_write(st, st->ino, st->offset, 10);
}
resume_10:
nfs_do_unshare_write(st, 11);
return;
resume_11:
;
}
st->self->parent->db->set(kv_inode_key(st->ino), new_unshared_ientry(st), [st](int res)
{
st->res = res;
nfs_kv_continue_write(st, 12);
}, [st](int res, const std::string & old_value)
{
return res == 0 && old_value == st->ientry_text;
});
return;
resume_12:
if (st->res == -EAGAIN)
{
// Restart
goto resume_0;
}
if (st->res < 0)
{
auto cb = std::move(st->cb);
cb(st->res);
return;
}
st->ientry_text = new_unshared_ientry(st);
}
// Non-shared write
nfs_do_align_write(st, st->ino, st->offset, 13);
return;
resume_13:
if (st->res == 0 && st->stable && !st->was_immediate)
{
nfs_do_fsync(st, 11);
nfs_do_fsync(st, 14);
return;
}
resume_11:
resume_14:
if (st->res < 0)
{
auto cb = std::move(st->cb);
@ -724,7 +784,7 @@ resume_11:
{
st->ext = &st->self->parent->kvfs->extends[st->ino];
st->ext->refcnt++;
resume_12:
resume_15:
if (st->ext->next_extend < st->new_size)
{
// Aggregate inode extension requests
@ -733,15 +793,15 @@ resume_12:
if (st->ext->cur_extend > 0)
{
// Wait for current extend which is already in progress
st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 12); });
st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 15); });
return;
}
if (st->ext->done_extend < st->new_size)
{
nfs_kv_extend_inode(st, 0);
nfs_kv_extend_inode(st, 15, 15);
return;
resume_13:
nfs_kv_extend_inode(st, 1);
resume_16:
nfs_kv_extend_inode(st, 16, 15);
}
st->ext->refcnt--;
assert(st->ext->refcnt >= 0);
@ -769,6 +829,8 @@ int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop)
st->ino = kv_fh_inode(args->file);
st->offset = args->offset;
st->size = (args->count > args->data.size ? args->data.size : args->count);
if (st->self->parent->trace)
fprintf(stderr, "[%d] WRITE %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size);
if (!st->ino || st->size > MAX_REQUEST_SIZE)
{
*reply = (WRITE3res){ .status = NFS3ERR_INVAL };

View File

@ -98,7 +98,7 @@ void nfs_proxy_t::run(json11::Json cfg)
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62);
// Parse options
trace = cfg["log_level"].uint64_value() > 5;
trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0;
bind_address = cfg["bind"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
@ -156,7 +156,15 @@ void nfs_proxy_t::run(json11::Json cfg)
check_default_pool();
// Check if we're using VitastorFS
fs_kv_inode = cfg["fs"].uint64_value();
if (!fs_kv_inode && cfg["fs"].is_string())
if (fs_kv_inode)
{
if (!INODE_POOL(fs_kv_inode))
{
fprintf(stderr, "FS metadata inode number must include pool\n");
exit(1);
}
}
else if (cfg["fs"].is_string())
{
for (auto & ic: cli->st_cli.inode_config)
{
@ -166,6 +174,11 @@ void nfs_proxy_t::run(json11::Json cfg)
break;
}
}
if (!fs_kv_inode)
{
fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str());
exit(1);
}
}
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
if (!readdir_getattr_parallel)

View File

@ -68,3 +68,5 @@ SCHEME=xor ./test_scrub.sh
PG_SIZE=3 ./test_scrub.sh
PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh
SCHEME=ec ./test_scrub.sh
./test_nfs.sh

149
tests/test_nfs.sh Executable file
View File

@ -0,0 +1,149 @@
#!/bin/bash -ex
PG_COUNT=16
. `dirname $0`/run_3osds.sh
build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta
build/src/vitastor-nfs --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
NFS_PID=$!
mkdir -p testdata/nfs
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
MNT=$(pwd)/testdata/nfs
trap "sudo umount -f $MNT"' || true; kill -9 $(jobs -p)' EXIT
# write small file
ls -l ./testdata/nfs
dd if=/dev/urandom of=./testdata/f1 bs=100k count=1
cp testdata/f1 ./testdata/nfs/
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
diff ./testdata/f1 ./testdata/nfs/f1
format_green "100K file ok"
# overwrite it inplace
dd if=/dev/urandom of=./testdata/f1_90k bs=90k count=1
cp testdata/f1_90k ./testdata/nfs/f1
sudo umount ./testdata/nfs/
format_green "inplace overwrite 90K ok"
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
# create another copy
dd if=./testdata/f1_90k of=./testdata/nfs/f1_nfs bs=1M
diff ./testdata/f1_90k ./testdata/nfs/f1_nfs
sudo umount ./testdata/nfs/
format_green "another copy 90K ok"
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
cp ./testdata/nfs/f1 ./testdata/f1_nfs
diff ./testdata/f1_90k ./testdata/nfs/f1
format_green "90K data ok"
# move it inplace
dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1
cp testdata/f1_110k ./testdata/nfs/f1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
diff ./testdata/f1_110k ./testdata/nfs/f1
format_green "move shared 90K -> 110K ok"
# extend it to large file + rm
dd if=/dev/urandom of=./testdata/f1_2M bs=2M count=1
cp ./testdata/f1_2M ./testdata/nfs/f1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
cp ./testdata/nfs/f1 ./testdata/f1_nfs
diff ./testdata/f1_2M ./testdata/nfs/f1
rm ./testdata/nfs/f1
format_green "extend to 2M + rm ok"
# mkdir
mkdir -p ./testdata/nfs/dir1/dir2
echo abcdef > ./testdata/nfs/dir1/dir2/hnpfls
# rename dir
mv ./testdata/nfs/dir1 ./testdata/nfs/dir3
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep dir3
ls -l ./testdata/nfs/dir3 | grep dir2
ls -l ./testdata/nfs/dir3/dir2 | grep hnpfls
echo abcdef > ./testdata/hnpfls
diff ./testdata/hnpfls ./testdata/nfs/dir3/dir2/hnpfls
format_green "rename dir with file ok"
# touch
touch -t 202401011404 ./testdata/nfs/dir3/dir2/hnpfls
sudo chown 65534:65534 ./testdata/nfs/dir3/dir2/hnpfls
sudo chmod 755 ./testdata/nfs/dir3/dir2/hnpfls
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
T=`stat -c '%a %u %g %y' ./testdata/nfs/dir3/dir2/hnpfls | perl -pe 's/(:\d+)(.*)/$1/'`
[[ "$T" = "755 65534 65534 2024-01-01 14:04" ]]
format_green "set attrs ok"
# move dir
mv ./testdata/nfs/dir3/dir2 ./testdata/nfs/
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep dir3
ls -l ./testdata/nfs | grep dir2
format_green "move dir ok"
# symlink, readlink
ln -s dir2 ./testdata/nfs/sym2
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
[[ "`readlink ./testdata/nfs/sym2`" = "dir2" ]]
format_green "symlink, readlink ok"
# mknod: chr, blk, sock, fifo + remove
sudo mknod ./testdata/nfs/nod_chr c 1 5
sudo mknod ./testdata/nfs/nod_blk b 2 6
mkfifo ./testdata/nfs/nod_fifo
perl -e 'use Socket; socket($sock, PF_UNIX, SOCK_STREAM, undef) || die $!; bind($sock, sockaddr_un("./testdata/nfs/nod_sock")) || die $!;'
chmod 777 ./testdata/nfs/nod_*
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`ls testdata|wc -l`" -ge 4 ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_blk`" = "brwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_chr`" = "crwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_fifo`" = "prwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_sock`" = "srwxrwxrwx" ]]
sudo rm ./testdata/nfs/nod_*
format_green "mknod + rm ok"
# hardlink
echo ABCDEF > ./testdata/nfs/linked1
i=`stat -c '%i' ./testdata/nfs/linked1`
ln ./testdata/nfs/linked1 ./testdata/nfs/linked2
[[ "`stat -c '%i' ./testdata/nfs/linked2`" -eq $i ]]
echo BABABA > ./testdata/nfs/linked2
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
[[ "`cat ./testdata/nfs/linked2`" = "BABABA" ]]
rm ./testdata/nfs/linked2
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`cat ./testdata/nfs/linked1`" = "BABABA" ]]
format_green "hardlink ok"
# rm small
ls -l ./testdata/nfs
dd if=/dev/urandom of=./testdata/nfs/smallfile bs=100k count=1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
rm ./testdata/nfs/smallfile
if ls ./testdata/nfs | grep smallfile; then false; fi
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
if ls ./testdata/nfs | grep smallfile; then false; fi
format_green "rm small ok"
format_green OK