Add basic NFS tests, fix bugs

antietcd
Vitaliy Filippov 2024-02-25 02:27:10 +03:00
parent a574f9ad71
commit ecfc753e93
14 changed files with 367 additions and 86 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \ RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \ RUN set -ex; \

View File

@ -856,3 +856,21 @@ jobs:
echo "" echo ""
done done
test_nfs:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_nfs.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

View File

@ -29,11 +29,29 @@ nfstime3 nfstime_from_str(const std::string & s)
return t; return t;
} }
static std::string timespec_to_str(timespec t)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ju.%09ju", t.tv_sec, t.tv_nsec);
int l = strlen(buf);
while (l > 0 && buf[l-1] == '0')
l--;
if (l > 0 && buf[l-1] == '.')
l--;
buf[l] = 0;
return buf;
}
std::string nfstime_to_str(nfstime3 t) std::string nfstime_to_str(nfstime3 t)
{ {
char buf[32]; return timespec_to_str((timespec){ .tv_sec = t.seconds, .tv_nsec = t.nseconds });
snprintf(buf, sizeof(buf), "%u.%09u", t.seconds, t.nseconds); }
return buf;
std::string nfstime_now_str()
{
timespec t;
clock_gettime(CLOCK_REALTIME, &t);
return timespec_to_str(t);
} }
int kv_map_type(const std::string & type) int kv_map_type(const std::string & type)

View File

@ -46,7 +46,7 @@ struct kv_inode_extend_t
struct kv_fs_state_t struct kv_fs_state_t
{ {
std::map<list_cookie_t, list_cookie_val_t> list_cookies; std::map<list_cookie_t, list_cookie_val_t> list_cookies;
uint64_t fs_next_id = 0, fs_allocated_id = 0; uint64_t fs_next_id = 1, fs_allocated_id = 0;
std::vector<uint64_t> unallocated_ids; std::vector<uint64_t> unallocated_ids;
std::vector<shared_alloc_queue_t> allocating_shared; std::vector<shared_alloc_queue_t> allocating_shared;
uint64_t cur_shared_inode = 0, cur_shared_offset = 0; uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
@ -57,12 +57,13 @@ struct shared_file_header_t
{ {
uint64_t magic = 0; uint64_t magic = 0;
uint64_t inode = 0; uint64_t inode = 0;
uint64_t size = 0; uint64_t alloc = 0;
}; };
nfsstat3 vitastor_nfs_map_err(int err); nfsstat3 vitastor_nfs_map_err(int err);
nfstime3 nfstime_from_str(const std::string & s); nfstime3 nfstime_from_str(const std::string & s);
std::string nfstime_to_str(nfstime3 t); std::string nfstime_to_str(nfstime3 t);
std::string nfstime_now_str();
int kv_map_type(const std::string & type); int kv_map_type(const std::string & type);
fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs); fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs);
std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename); std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename);

View File

@ -94,6 +94,10 @@ static void kv_do_create(kv_create_state *st)
cb(-EINVAL); cb(-EINVAL);
return; return;
} }
if (st->attrobj.find("mtime") == st->attrobj.end())
st->attrobj["mtime"] = nfstime_now_str();
if (st->attrobj.find("atime") == st->attrobj.end())
st->attrobj["atime"] = st->attrobj["mtime"];
// Generate inode ID // Generate inode ID
allocate_new_id(st->self, [st](int res, uint64_t new_id) allocate_new_id(st->self, [st](int res, uint64_t new_id)
{ {
@ -105,7 +109,8 @@ static void kv_do_create(kv_create_state *st)
} }
st->new_id = new_id; st->new_id = new_id;
auto direntry = json11::Json::object{ { "ino", st->new_id } }; auto direntry = json11::Json::object{ { "ino", st->new_id } };
if (st->attrobj["type"].string_value() == "dir") if (st->attrobj.find("type") != st->attrobj.end() &&
st->attrobj["type"].string_value() == "dir")
{ {
direntry["type"] = "dir"; direntry["type"] = "dir";
} }

View File

@ -57,6 +57,8 @@ int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
} }
kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs) kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs)
{ {
if (self->parent->trace)
fprintf(stderr, "[%d] GETATTR %ju -> %s\n", self->nfs_fd, ino, value.c_str());
if (res < 0) if (res < 0)
{ {
*reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) }; *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) };

View File

@ -85,8 +85,7 @@ resume_2:
return; return;
} }
auto hdr = ((shared_file_header_t*)st->aligned_buf); auto hdr = ((shared_file_header_t*)st->aligned_buf);
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino || if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino)
align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
{ {
// Got unrelated data - retry from the beginning // Got unrelated data - retry from the beginning
free(st->aligned_buf); free(st->aligned_buf);
@ -101,7 +100,7 @@ resume_2:
} }
} }
st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1)); st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1));
st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment) & st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment-1) &
~(st->self->parent->pool_alignment-1)) - st->aligned_offset; ~(st->self->parent->pool_alignment-1)) - st->aligned_offset;
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
st->buf = st->aligned_buf + st->offset - st->aligned_offset; st->buf = st->aligned_buf + st->offset - st->aligned_offset;
@ -121,7 +120,7 @@ resume_2:
return; return;
resume_3: resume_3:
auto cb = std::move(st->cb); auto cb = std::move(st->cb);
cb(st->res); cb(st->res < 0 ? st->res : 0);
return; return;
} }
@ -157,6 +156,8 @@ int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop)
rpc_queue_reply(st->rop); rpc_queue_reply(st->rop);
delete st; delete st;
}; };
if (st->self->parent->trace)
fprintf(stderr, "[%d] READ %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size);
nfs_kv_continue_read(st, 0); nfs_kv_continue_read(st, 0);
return 1; return 1;
} }

View File

@ -188,7 +188,7 @@ resume_2:
auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie }); auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie });
if (lc_it != st->self->parent->kvfs->list_cookies.end()) if (lc_it != st->self->parent->kvfs->list_cookies.end())
{ {
st->start = lc_it->second.key; st->start = st->prefix+lc_it->second.key;
st->to_skip = 1; st->to_skip = 1;
st->offset = st->cookie; st->offset = st->cookie;
} }
@ -235,7 +235,7 @@ resume_2:
st->self->parent->db->list_next(st->list_handle, NULL); st->self->parent->db->list_next(st->list_handle, NULL);
return; return;
resume_3: resume_3:
if (st->res == -ENOENT || st->cur_key.size() > st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix) if (st->res == -ENOENT || st->cur_key.size() < st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix)
{ {
st->self->parent->db->list_close(st->list_handle); st->self->parent->db->list_close(st->list_handle);
st->list_handle = NULL; st->list_handle = NULL;
@ -256,6 +256,11 @@ resume_3:
} }
auto ino = direntry["ino"].uint64_value(); auto ino = direntry["ino"].uint64_value();
auto name = kv_direntry_filename(st->cur_key); auto name = kv_direntry_filename(st->cur_key);
if (st->self->parent->trace)
{
fprintf(stderr, "[%d] READDIR %ju %lu %s\n",
st->self->nfs_fd, st->dir_ino, st->offset, name.c_str());
}
auto fh = kv_fh(ino); auto fh = kv_fh(ino);
// 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
// 1 entryplus3 is (8+4+(filename_len+3)/4*4+8 // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
@ -276,7 +281,7 @@ resume_3:
entry->name = xdr_copy_string(st->rop->xdrs, name); entry->name = xdr_copy_string(st->rop->xdrs, name);
entry->fileid = ino; entry->fileid = ino;
entry->cookie = st->offset++; entry->cookie = st->offset++;
st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name }; st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = name };
if (st->is_plus) if (st->is_plus)
{ {
entry->name_handle = (post_op_fh3){ entry->name_handle = (post_op_fh3){
@ -285,7 +290,6 @@ resume_3:
}; };
kv_getattr_next(st); kv_getattr_next(st);
} }
st->self->parent->db->list_next(st->list_handle, NULL);
} }
resume_4: resume_4:
while (st->getattr_running > 0) while (st->getattr_running > 0)

View File

@ -225,7 +225,7 @@ resume_6:
} }
// (6) If regular file and inode is deleted: delete data // (6) If regular file and inode is deleted: delete data
if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 && if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 &&
!st->ientry["shared_inode"].uint64_value()) !st->ientry["shared_ino"].uint64_value())
{ {
// Remove data // Remove data
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {

View File

@ -99,7 +99,8 @@ resume_2:
return; return;
} }
if (!st->set_attrs["size"].is_null() && if (!st->set_attrs["size"].is_null() &&
st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value()) st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value() &&
!st->ientry["shared_ino"].uint64_value())
{ {
// Delete extra data when downsizing // Delete extra data when downsizing
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
@ -147,11 +148,16 @@ int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid; st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid;
if (args->new_attributes.gid.set_it) if (args->new_attributes.gid.set_it)
st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid; st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid;
if (args->new_attributes.atime.set_it) if (args->new_attributes.atime.set_it == SET_TO_SERVER_TIME)
st->set_attrs["atime"] = nfstime_now_str();
else if (args->new_attributes.atime.set_it == SET_TO_CLIENT_TIME)
st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime); st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
if (args->new_attributes.mtime.set_it) if (args->new_attributes.mtime.set_it == SET_TO_SERVER_TIME)
st->set_attrs["mtime"] = nfstime_now_str();
else if (args->new_attributes.mtime.set_it == SET_TO_CLIENT_TIME)
st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime); st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str()); if (st->self->parent->trace)
fprintf(stderr, "[%d] SETATTR %ju ATTRS %s\n", st->self->nfs_fd, st->ino, json11::Json(st->set_attrs).dump().c_str());
st->cb = [st](int res) st->cb = [st](int res)
{ {
auto reply = (SETATTR3res*)st->rop->reply; auto reply = (SETATTR3res*)st->rop->reply;

View File

@ -17,6 +17,7 @@ struct nfs_rmw_t
uint8_t *buf = NULL; uint8_t *buf = NULL;
uint64_t size = 0; uint64_t size = 0;
uint8_t *part_buf = NULL; uint8_t *part_buf = NULL;
uint64_t version = 0;
}; };
struct nfs_kv_write_state struct nfs_kv_write_state
@ -106,6 +107,14 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t si
); );
}); });
} }
else
{
st->res = 0;
st->shared_inode = st->self->parent->kvfs->cur_shared_inode;
st->shared_offset = st->self->parent->kvfs->cur_shared_offset;
st->self->parent->kvfs->cur_shared_offset += (size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1);
nfs_kv_continue_write(st, state);
}
} }
uint64_t align_shared_size(nfs_client_t *self, uint64_t size) uint64_t align_shared_size(nfs_client_t *self, uint64_t size)
@ -154,10 +163,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
{ {
auto parent = rmw->st->self->parent; auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment; auto align = parent->pool_alignment;
bool is_begin = (rmw->offset % align); assert(rmw->size < align);
bool is_end = ((rmw->offset+rmw->size) % align);
// RMW either only at beginning or only at end and within a single block
assert(is_begin != is_end);
assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size)); assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size));
if (!rmw->part_buf) if (!rmw->part_buf)
{ {
@ -166,7 +172,7 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_READ; op->opcode = OSD_OP_READ;
op->inode = parent->fs_base_inode + rmw->ino; op->inode = parent->fs_base_inode + rmw->ino;
op->offset = (rmw->offset + (is_begin ? 0 : rmw->size)) & ~(align-1); op->offset = rmw->offset & ~(align-1);
op->len = align; op->len = align;
op->iov.push_back(rmw->part_buf, op->len); op->iov.push_back(rmw->part_buf, op->len);
rmw->st->waiting++; rmw->st->waiting++;
@ -185,30 +191,43 @@ static void nfs_do_rmw(nfs_rmw_t *rmw)
} }
else else
{ {
if (!rmw->version)
{
auto st = rmw->st;
rmw->version = rd_op->version+1;
if (st->rmw[0].st && st->rmw[1].st &&
st->rmw[0].offset/st->self->parent->pool_block_size == st->rmw[1].offset/st->self->parent->pool_block_size)
{
// Same block... RMWs should be sequential
int other = rmw == &st->rmw[0] ? 1 : 0;
st->rmw[other].version = rmw->version+1;
}
}
auto parent = rmw->st->self->parent; auto parent = rmw->st->self->parent;
auto align = parent->pool_alignment; auto align = parent->pool_alignment;
bool is_begin = (rmw->offset % align); bool is_begin = (rmw->offset % align);
bool is_end = ((rmw->offset+rmw->size) % align);
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE; op->opcode = OSD_OP_WRITE;
op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino; op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino;
op->offset = rmw->offset & ~(align-1); op->offset = rmw->offset & ~(align-1);
op->len = (rmw->size + align-1) & ~(align-1); op->len = align;
op->version = rd_op->version+1; op->version = rmw->version;
if (is_begin) if (is_begin)
{ {
op->iov.push_back(rmw->part_buf, rmw->offset % align); op->iov.push_back(rmw->part_buf, rmw->offset % align);
} }
op->iov.push_back(rmw->buf, rmw->size); op->iov.push_back(rmw->buf, rmw->size);
if (!is_begin) if (is_end)
{ {
auto tail = ((rmw->offset+rmw->size) % align); op->iov.push_back(rmw->part_buf + (rmw->offset % align) + rmw->size, align - (rmw->offset % align) - rmw->size);
op->iov.push_back(rmw->part_buf + tail, align - tail);
} }
op->callback = [rmw](cluster_op_t *op) op->callback = [rmw](cluster_op_t *op)
{ {
if (op->retval == -EAGAIN) if (op->retval == -EINTR)
{ {
// CAS failure - retry // CAS failure - retry
rmw->version = 0;
rmw->st->waiting--; rmw->st->waiting--;
nfs_do_rmw(rmw); nfs_do_rmw(rmw);
} }
@ -272,11 +291,12 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int
else if (state == base_state) goto resume_0; else if (state == base_state) goto resume_0;
assert(!st->aligned_buf); assert(!st->aligned_buf);
st->aligned_size = unshare st->aligned_size = unshare
? sizeof(shared_file_header_t) + (st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1) ? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1))
: align_shared_size(st->self, st->new_size); : align_shared_size(st->self, st->new_size);
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset); memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset);
if (st->ientry["shared_ino"].uint64_value() != 0) if (st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["size"].uint64_value() != 0)
{ {
// Read old data if shared non-empty // Read old data if shared non-empty
nfs_do_shared_read(st, base_state); nfs_do_shared_read(st, base_state);
@ -286,14 +306,15 @@ resume_0:
{ {
auto cb = std::move(st->cb); auto cb = std::move(st->cb);
cb(st->res); cb(st->res);
return true; return false;
} }
auto hdr = ((shared_file_header_t*)st->aligned_buf); auto hdr = ((shared_file_header_t*)st->aligned_buf);
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino || if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino)
align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
{ {
// Got unrelated data - retry from the beginning // Got unrelated data - retry from the beginning
st->allow_cache = false; st->allow_cache = false;
free(st->aligned_buf);
st->aligned_buf = NULL;
nfs_kv_continue_write(st, 0); nfs_kv_continue_write(st, 0);
return false; return false;
} }
@ -301,7 +322,7 @@ resume_0:
*((shared_file_header_t*)st->aligned_buf) = { *((shared_file_header_t*)st->aligned_buf) = {
.magic = SHARED_FILE_MAGIC_V1, .magic = SHARED_FILE_MAGIC_V1,
.inode = st->ino, .inode = st->ino,
.size = st->new_size, .alloc = st->aligned_size,
}; };
memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size); memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size);
memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0, memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0,
@ -316,6 +337,8 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
uint64_t good_offset = offset; uint64_t good_offset = offset;
uint64_t good_size = st->size; uint64_t good_size = st->size;
st->waiting++; st->waiting++;
st->rmw[0].st = NULL;
st->rmw[1].st = NULL;
if (offset % alignment) if (offset % alignment)
{ {
// Requires read-modify-write in the beginning // Requires read-modify-write in the beginning
@ -337,17 +360,18 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
.buf = st->buf, .buf = st->buf,
.size = s, .size = s,
}; };
// FIXME: skip rmw at shared beginning
nfs_do_rmw(&st->rmw[0]); nfs_do_rmw(&st->rmw[0]);
} }
if ((offset+st->size-1) % alignment) if ((offset+st->size) % alignment)
{ {
// Requires read-modify-write in the end // Requires read-modify-write in the end
auto s = ((offset+st->size-1) % alignment); auto s = ((offset+st->size) % alignment);
if (good_size > s) if (good_size > s)
good_size -= s; good_size -= s;
else else
good_size = 0; good_size = 0;
if (((offset+st->size-1) / alignment) > (offset / alignment)) if ((offset+st->size)/alignment > offset/alignment)
{ {
st->rmw[1] = { st->rmw[1] = {
.st = st, .st = st,
@ -357,6 +381,7 @@ static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t of
.buf = st->buf + st->size-s, .buf = st->buf + st->size-s,
.size = s, .size = s,
}; };
// FIXME: skip rmw at end
nfs_do_rmw(&st->rmw[1]); nfs_do_rmw(&st->rmw[1]);
} }
} }
@ -405,19 +430,28 @@ static std::string new_shared_ientry(nfs_kv_write_state *st)
return json11::Json(ni).dump(); return json11::Json(ni).dump();
} }
static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state) static std::string new_unshared_ientry(nfs_kv_write_state *st)
{ {
if (state == 1) auto ni = st->ientry.object_items();
{ ni.erase("empty");
ni.erase("shared_ino");
ni.erase("shared_offset");
ni.erase("shared_alloc");
ni.erase("shared_ver");
return json11::Json(ni).dump();
}
static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state, int base_state)
{
if (state == base_state+1)
goto resume_1; goto resume_1;
}
st->ext->cur_extend = st->ext->next_extend; st->ext->cur_extend = st->ext->next_extend;
st->ext->next_extend = 0; st->ext->next_extend = 0;
st->res2 = -EAGAIN; st->res2 = -EAGAIN;
st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st](int res) st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st, base_state](int res)
{ {
st->res = res; st->res = res;
nfs_kv_continue_write(st, 13); nfs_kv_continue_write(st, base_state+1);
}, [st](int res, const std::string & old_value) }, [st](int res, const std::string & old_value)
{ {
if (res != 0) if (res != 0)
@ -432,6 +466,7 @@ static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state)
auto ientry = json11::Json::parse(old_value, err).object_items(); auto ientry = json11::Json::parse(old_value, err).object_items();
if (err != "") if (err != "")
{ {
fprintf(stderr, "Invalid JSON in inode %lu = %s: %s\n", st->ino, old_value.c_str(), err.c_str());
st->res2 = -EINVAL; st->res2 = -EINVAL;
return false; return false;
} }
@ -501,24 +536,20 @@ resume_1:
// - In parallel: check if data fits into inode size and extend if it doesn't // - In parallel: check if data fits into inode size and extend if it doesn't
// - If CAS failure: re-read inode and retry to extend the size // - If CAS failure: re-read inode and retry to extend the size
// - If shared: // - If shared:
// - Read whole file from shared inode
// - If the file header in data doesn't match: re-read inode and restart
// - If data doesn't fit into the same shared inode: // - If data doesn't fit into the same shared inode:
// - Allocate space in a new shared inode // - Allocate space in a new shared inode
// - Read whole file from shared inode
// - Write data into the new shared inode // - Write data into the new shared inode
// - If CAS failure: allocate another shared inode and retry // - If CAS failure: allocate another shared inode and retry
// - Update inode metadata (set new size and new shared inode) // - Update inode metadata (set new size and new shared inode)
// - If CAS failure: free allocated shared space, re-read inode and restart // - If CAS failure: free allocated shared space, re-read inode and restart
// - If it fits: // - If it fits:
// - Write updated data into the shared inode // - Update shared inode data in-place
// - Update inode entry in any case to block parallel non-shared writes // - Update inode entry in any case to block parallel non-shared writes
// - If CAS failure: re-read inode and restart // - If CAS failure: re-read inode and restart
// - Otherwise: // - Otherwise:
// - Write data into non-shared inode // - Read inode
// - Read inode in parallel // - If not a regular file - stop with -EINVAL
// - If not a regular file:
// - Remove data
// - Stop with -EINVAL
// - If shared: // - If shared:
// - Read whole file from shared inode // - Read whole file from shared inode
// - Write data into non-shared inode // - Write data into non-shared inode
@ -526,9 +557,9 @@ resume_1:
// - Update inode metadata (make non-shared, update size) // - Update inode metadata (make non-shared, update size)
// - If CAS failure: restart // - If CAS failure: restart
// - Zero out the shared inode header // - Zero out the shared inode header
// - If CAS failure: restart // - Write data into non-shared inode
// - Check if size fits // - Check if size fits
// - Extend if it doesn't // - Extend if it doesn't
// Read: // Read:
// - If (offset+size <= threshold): // - If (offset+size <= threshold):
// - Read inode from cache // - Read inode from cache
@ -557,6 +588,9 @@ static void nfs_kv_continue_write(nfs_kv_write_state *st, int state)
else if (state == 11) goto resume_11; else if (state == 11) goto resume_11;
else if (state == 12) goto resume_12; else if (state == 12) goto resume_12;
else if (state == 13) goto resume_13; else if (state == 13) goto resume_13;
else if (state == 14) goto resume_14;
else if (state == 15) goto resume_15;
else if (state == 16) goto resume_16;
else else
{ {
fprintf(stderr, "BUG: invalid state in nfs_kv_continue_write()"); fprintf(stderr, "BUG: invalid state in nfs_kv_continue_write()");
@ -578,9 +612,7 @@ resume_0:
}, st->allow_cache); }, st->allow_cache);
return; return;
resume_1: resume_1:
if (st->res < 0 || if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG)
st->ientry["type"].uint64_value() != 0 &&
st->ientry["type"].uint64_value() != NF3REG)
{ {
auto cb = std::move(st->cb); auto cb = std::move(st->cb);
cb(st->res == 0 ? -EINVAL : st->res); cb(st->res == 0 ? -EINVAL : st->res);
@ -594,11 +626,11 @@ resume_1:
} }
if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold) if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
{ {
if (st->ientry["size"].uint64_value() == 0 || if (st->ientry["size"].uint64_value() == 0 &&
st->ientry["shared_ino"].uint64_value() == 0 ||
st->ientry["empty"].bool_value() && st->ientry["empty"].bool_value() &&
st->ientry["size"].uint64_value() + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold || (st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold ||
st->ientry["shared_ino"].uint64_value() != 0 && st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["size"].uint64_value() < st->offset+st->size &&
st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size)) st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size))
{ {
// Either empty, or shared and requires moving into a larger place (redirect-write) // Either empty, or shared and requires moving into a larger place (redirect-write)
@ -669,17 +701,18 @@ resume_7:
nfs_do_fsync(st, 8); nfs_do_fsync(st, 8);
return; return;
} }
resume_8:
// We always have to change inode entry on shared writes // We always have to change inode entry on shared writes
st->self->parent->db->set(kv_inode_key(st->ino), new_shared_ientry(st), [st](int res) st->self->parent->db->set(kv_inode_key(st->ino), new_shared_ientry(st), [st](int res)
{ {
st->res = res; st->res = res;
nfs_kv_continue_write(st, 8); nfs_kv_continue_write(st, 9);
}, [st](int res, const std::string & old_value) }, [st](int res, const std::string & old_value)
{ {
return res == 0 && old_value == st->ientry_text; return res == 0 && old_value == st->ientry_text;
}); });
return; return;
resume_8: resume_9:
if (st->res == -EAGAIN) if (st->res == -EAGAIN)
{ {
goto resume_0; goto resume_0;
@ -690,28 +723,55 @@ resume_8:
} }
// Fall through for non-shared // Fall through for non-shared
} }
// Non-shared write // Unshare?
if (st->ientry["shared_ino"].uint64_value() != 0) if (st->ientry["shared_ino"].uint64_value() != 0)
{ {
// Unshare if (st->ientry["size"].uint64_value() != 0)
resume_9: {
if (!nfs_do_shared_readmodify(st, 9, state, true)) assert(!st->aligned_buf);
st->aligned_size = align_shared_size(st->self, st->ientry["size"].uint64_value());
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
nfs_do_shared_read(st, 10);
return; return;
nfs_do_unshare_write(st, 10);
return;
}
else
{
// Just write
nfs_do_align_write(st, st->ino, st->offset, 10);
}
resume_10: resume_10:
nfs_do_unshare_write(st, 11);
return;
resume_11:
;
}
st->self->parent->db->set(kv_inode_key(st->ino), new_unshared_ientry(st), [st](int res)
{
st->res = res;
nfs_kv_continue_write(st, 12);
}, [st](int res, const std::string & old_value)
{
return res == 0 && old_value == st->ientry_text;
});
return;
resume_12:
if (st->res == -EAGAIN)
{
// Restart
goto resume_0;
}
if (st->res < 0)
{
auto cb = std::move(st->cb);
cb(st->res);
return;
}
st->ientry_text = new_unshared_ientry(st);
}
// Non-shared write
nfs_do_align_write(st, st->ino, st->offset, 13);
return;
resume_13:
if (st->res == 0 && st->stable && !st->was_immediate) if (st->res == 0 && st->stable && !st->was_immediate)
{ {
nfs_do_fsync(st, 11); nfs_do_fsync(st, 14);
return; return;
} }
resume_11: resume_14:
if (st->res < 0) if (st->res < 0)
{ {
auto cb = std::move(st->cb); auto cb = std::move(st->cb);
@ -724,7 +784,7 @@ resume_11:
{ {
st->ext = &st->self->parent->kvfs->extends[st->ino]; st->ext = &st->self->parent->kvfs->extends[st->ino];
st->ext->refcnt++; st->ext->refcnt++;
resume_12: resume_15:
if (st->ext->next_extend < st->new_size) if (st->ext->next_extend < st->new_size)
{ {
// Aggregate inode extension requests // Aggregate inode extension requests
@ -733,15 +793,15 @@ resume_12:
if (st->ext->cur_extend > 0) if (st->ext->cur_extend > 0)
{ {
// Wait for current extend which is already in progress // Wait for current extend which is already in progress
st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 12); }); st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 15); });
return; return;
} }
if (st->ext->done_extend < st->new_size) if (st->ext->done_extend < st->new_size)
{ {
nfs_kv_extend_inode(st, 0); nfs_kv_extend_inode(st, 15, 15);
return; return;
resume_13: resume_16:
nfs_kv_extend_inode(st, 1); nfs_kv_extend_inode(st, 16, 15);
} }
st->ext->refcnt--; st->ext->refcnt--;
assert(st->ext->refcnt >= 0); assert(st->ext->refcnt >= 0);
@ -769,6 +829,8 @@ int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop)
st->ino = kv_fh_inode(args->file); st->ino = kv_fh_inode(args->file);
st->offset = args->offset; st->offset = args->offset;
st->size = (args->count > args->data.size ? args->data.size : args->count); st->size = (args->count > args->data.size ? args->data.size : args->count);
if (st->self->parent->trace)
fprintf(stderr, "[%d] WRITE %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size);
if (!st->ino || st->size > MAX_REQUEST_SIZE) if (!st->ino || st->size > MAX_REQUEST_SIZE)
{ {
*reply = (WRITE3res){ .status = NFS3ERR_INVAL }; *reply = (WRITE3res){ .status = NFS3ERR_INVAL };

View File

@ -98,7 +98,7 @@ void nfs_proxy_t::run(json11::Json cfg)
srand48(tv.tv_sec*1000000000 + tv.tv_nsec); srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62); server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62);
// Parse options // Parse options
trace = cfg["log_level"].uint64_value() > 5; trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0;
bind_address = cfg["bind"].string_value(); bind_address = cfg["bind"].string_value();
if (bind_address == "") if (bind_address == "")
bind_address = "0.0.0.0"; bind_address = "0.0.0.0";
@ -156,7 +156,15 @@ void nfs_proxy_t::run(json11::Json cfg)
check_default_pool(); check_default_pool();
// Check if we're using VitastorFS // Check if we're using VitastorFS
fs_kv_inode = cfg["fs"].uint64_value(); fs_kv_inode = cfg["fs"].uint64_value();
if (!fs_kv_inode && cfg["fs"].is_string()) if (fs_kv_inode)
{
if (!INODE_POOL(fs_kv_inode))
{
fprintf(stderr, "FS metadata inode number must include pool\n");
exit(1);
}
}
else if (cfg["fs"].is_string())
{ {
for (auto & ic: cli->st_cli.inode_config) for (auto & ic: cli->st_cli.inode_config)
{ {
@ -166,6 +174,11 @@ void nfs_proxy_t::run(json11::Json cfg)
break; break;
} }
} }
if (!fs_kv_inode)
{
fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str());
exit(1);
}
} }
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value(); readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
if (!readdir_getattr_parallel) if (!readdir_getattr_parallel)

View File

@ -68,3 +68,5 @@ SCHEME=xor ./test_scrub.sh
PG_SIZE=3 ./test_scrub.sh PG_SIZE=3 ./test_scrub.sh
PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh
SCHEME=ec ./test_scrub.sh SCHEME=ec ./test_scrub.sh
./test_nfs.sh

149
tests/test_nfs.sh Executable file
View File

@ -0,0 +1,149 @@
#!/bin/bash -ex
PG_COUNT=16
. `dirname $0`/run_3osds.sh
build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta
build/src/vitastor-nfs --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
NFS_PID=$!
mkdir -p testdata/nfs
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
MNT=$(pwd)/testdata/nfs
trap "sudo umount -f $MNT"' || true; kill -9 $(jobs -p)' EXIT
# write small file
ls -l ./testdata/nfs
dd if=/dev/urandom of=./testdata/f1 bs=100k count=1
cp testdata/f1 ./testdata/nfs/
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
diff ./testdata/f1 ./testdata/nfs/f1
format_green "100K file ok"
# overwrite it inplace
dd if=/dev/urandom of=./testdata/f1_90k bs=90k count=1
cp testdata/f1_90k ./testdata/nfs/f1
sudo umount ./testdata/nfs/
format_green "inplace overwrite 90K ok"
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
# create another copy
dd if=./testdata/f1_90k of=./testdata/nfs/f1_nfs bs=1M
diff ./testdata/f1_90k ./testdata/nfs/f1_nfs
sudo umount ./testdata/nfs/
format_green "another copy 90K ok"
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
cp ./testdata/nfs/f1 ./testdata/f1_nfs
diff ./testdata/f1_90k ./testdata/nfs/f1
format_green "90K data ok"
# move it inplace
dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1
cp testdata/f1_110k ./testdata/nfs/f1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
diff ./testdata/f1_110k ./testdata/nfs/f1
format_green "move shared 90K -> 110K ok"
# extend it to large file + rm
dd if=/dev/urandom of=./testdata/f1_2M bs=2M count=1
cp ./testdata/f1_2M ./testdata/nfs/f1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep f1
cp ./testdata/nfs/f1 ./testdata/f1_nfs
diff ./testdata/f1_2M ./testdata/nfs/f1
rm ./testdata/nfs/f1
format_green "extend to 2M + rm ok"
# mkdir
mkdir -p ./testdata/nfs/dir1/dir2
echo abcdef > ./testdata/nfs/dir1/dir2/hnpfls
# rename dir
mv ./testdata/nfs/dir1 ./testdata/nfs/dir3
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep dir3
ls -l ./testdata/nfs/dir3 | grep dir2
ls -l ./testdata/nfs/dir3/dir2 | grep hnpfls
echo abcdef > ./testdata/hnpfls
diff ./testdata/hnpfls ./testdata/nfs/dir3/dir2/hnpfls
format_green "rename dir with file ok"
# touch
touch -t 202401011404 ./testdata/nfs/dir3/dir2/hnpfls
sudo chown 65534:65534 ./testdata/nfs/dir3/dir2/hnpfls
sudo chmod 755 ./testdata/nfs/dir3/dir2/hnpfls
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
T=`stat -c '%a %u %g %y' ./testdata/nfs/dir3/dir2/hnpfls | perl -pe 's/(:\d+)(.*)/$1/'`
[[ "$T" = "755 65534 65534 2024-01-01 14:04" ]]
format_green "set attrs ok"
# move dir
mv ./testdata/nfs/dir3/dir2 ./testdata/nfs/
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
ls -l ./testdata/nfs | grep dir3
ls -l ./testdata/nfs | grep dir2
format_green "move dir ok"
# symlink, readlink
ln -s dir2 ./testdata/nfs/sym2
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
[[ "`readlink ./testdata/nfs/sym2`" = "dir2" ]]
format_green "symlink, readlink ok"
# mknod: chr, blk, sock, fifo + remove
sudo mknod ./testdata/nfs/nod_chr c 1 5
sudo mknod ./testdata/nfs/nod_blk b 2 6
mkfifo ./testdata/nfs/nod_fifo
perl -e 'use Socket; socket($sock, PF_UNIX, SOCK_STREAM, undef) || die $!; bind($sock, sockaddr_un("./testdata/nfs/nod_sock")) || die $!;'
chmod 777 ./testdata/nfs/nod_*
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`ls testdata|wc -l`" -ge 4 ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_blk`" = "brwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_chr`" = "crwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_fifo`" = "prwxrwxrwx" ]]
[[ "`stat -c '%A' ./testdata/nfs/nod_sock`" = "srwxrwxrwx" ]]
sudo rm ./testdata/nfs/nod_*
format_green "mknod + rm ok"
# hardlink
echo ABCDEF > ./testdata/nfs/linked1
i=`stat -c '%i' ./testdata/nfs/linked1`
ln ./testdata/nfs/linked1 ./testdata/nfs/linked2
[[ "`stat -c '%i' ./testdata/nfs/linked2`" -eq $i ]]
echo BABABA > ./testdata/nfs/linked2
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
[[ "`cat ./testdata/nfs/linked2`" = "BABABA" ]]
rm ./testdata/nfs/linked2
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
[[ "`cat ./testdata/nfs/linked1`" = "BABABA" ]]
format_green "hardlink ok"
# rm small
ls -l ./testdata/nfs
dd if=/dev/urandom of=./testdata/nfs/smallfile bs=100k count=1
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
rm ./testdata/nfs/smallfile
if ls ./testdata/nfs | grep smallfile; then false; fi
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
if ls ./testdata/nfs | grep smallfile; then false; fi
format_green "rm small ok"
format_green OK