WIP FS readdir
Test / buildenv (push) Successful in 10s Details
Test / build (push) Failing after 1m39s Details
Test / make_test (push) Has been skipped Details
Test / test_add_osd (push) Has been skipped Details
Test / test_cas (push) Has been skipped Details
Test / test_change_pg_count (push) Has been skipped Details
Test / test_change_pg_count_ec (push) Has been skipped Details
Test / test_change_pg_size (push) Has been skipped Details
Test / test_create_nomaxid (push) Has been skipped Details
Test / test_etcd_fail (push) Has been skipped Details
Test / test_interrupted_rebalance (push) Has been skipped Details
Test / test_interrupted_rebalance_imm (push) Has been skipped Details
Test / test_interrupted_rebalance_ec (push) Has been skipped Details
Test / test_interrupted_rebalance_ec_imm (push) Has been skipped Details
Test / test_failure_domain (push) Has been skipped Details
Test / test_snapshot (push) Has been skipped Details
Test / test_snapshot_ec (push) Has been skipped Details
Test / test_minsize_1 (push) Has been skipped Details
Test / test_move_reappear (push) Has been skipped Details
Test / test_rm (push) Has been skipped Details
Test / test_snapshot_chain (push) Has been skipped Details
Test / test_snapshot_chain_ec (push) Has been skipped Details
Test / test_snapshot_down (push) Has been skipped Details
Test / test_snapshot_down_ec (push) Has been skipped Details
Test / test_splitbrain (push) Has been skipped Details
Test / test_rebalance_verify (push) Has been skipped Details
Test / test_rebalance_verify_imm (push) Has been skipped Details
Test / test_rebalance_verify_ec (push) Has been skipped Details
Test / test_rebalance_verify_ec_imm (push) Has been skipped Details
Test / test_write (push) Has been skipped Details
Test / test_write_xor (push) Has been skipped Details
Test / test_write_no_same (push) Has been skipped Details
Test / test_heal_pg_size_2 (push) Has been skipped Details
Test / test_heal_ec (push) Has been skipped Details
Test / test_heal_csum_32k_dmj (push) Has been skipped Details
Test / test_heal_csum_32k_dj (push) Has been skipped Details
Test / test_heal_csum_32k (push) Has been skipped Details
Test / test_heal_csum_4k_dmj (push) Has been skipped Details
Test / test_heal_csum_4k_dj (push) Has been skipped Details
Test / test_heal_csum_4k (push) Has been skipped Details
Test / test_scrub (push) Has been skipped Details
Test / test_scrub_zero_osd_2 (push) Has been skipped Details
Test / test_scrub_xor (push) Has been skipped Details
Test / test_scrub_pg_size_3 (push) Has been skipped Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been skipped Details
Test / test_scrub_ec (push) Has been skipped Details

Vitaliy Filippov 2024-01-08 01:41:29 +03:00
parent 0a10114469
commit 2da50a7b5a
1 changed file with 250 additions and 157 deletions

View File

@ -1486,6 +1486,7 @@ static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
{
auto st = new nfs_kv_rename_state;
st->self = (nfs_client_t*)opaque;
st->rop = rop;
RENAME3args *args = (RENAME3args*)rop->request;
st->old_dir_ino = kv_fh_inode(args->from.dir);
st->new_dir_ino = kv_fh_inode(args->to.dir);
@ -1521,6 +1522,11 @@ static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
{
// 1) Find the source file
// 2) If it's a directory - fail with -EISDIR
// 3) Update the inode entry with refcount++
// 5) Create the new direntry with the same inode reference
// Fail and rollback refcount if it already exists
//nfs_client_t *self = (nfs_client_t*)opaque;
//LINK3args *args = (LINK3args*)rop->request;
LINK3res *reply = (LINK3res*)rop->reply;
@ -1530,126 +1536,225 @@ static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
return 0;
}
static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
std::map<std::string, nfs_dir_t>::iterator dir_id_it, struct entryplus3 *entry, bool is_plus)
struct nfs_kv_readdir_state
{
if (dir_id_it == self->parent->dir_info.end())
nfs_client_t *self = NULL;
rpc_op_t *rop = NULL;
bool is_plus = false;
READDIRPLUS3args args;
uint64_t dir_ino = 0;
std::string prefix;
void *list_handle;
bool eof = false;
int reply_size = 0;
std::vector<entryplus3> entries;
};
static void kv_getattr_next(nfs_kv_readdir_state *st)
{
while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel)
{
return;
}
entry->fileid = dir_id_it->second.id;
if (is_plus)
{
entry->name_attributes = (post_op_attr){
.attributes_follow = 1,
.attributes = get_dir_attributes(self, dir_id_it->first),
};
entry->name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(dir_id_it->first))),
};
auto idx = st->getattr_cur++;
st->getattr_running++;
kv_read_inode(st->self, st->entries[idx]->fileid, [st, idx](int res, const std::string & value, json11::Json ientry)
{
if (res == 0)
{
st->entries[idx]->name_attributes = (post_op_attr){
// FIXME: maybe do not read parent attributes and leave them to a GETATTR?
.attributes_follow = 1,
.attributes = get_kv_attributes(self, st->entries[idx]->fileid, ientry),
};
}
st->getattr_running--;
kv_getattr_next(st);
if (st->getattr_running == 0 && !st->list_handle)
{
nfs_kv_continue_readdir(st, 4);
}
});
}
}
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state)
{
nfs_client_t *self = (nfs_client_t*)opaque;
READDIRPLUS3args plus_args;
READDIRPLUS3args *args = NULL;
if (is_plus)
args = ((READDIRPLUS3args*)rop->request);
if (state == 0) {}
else if (state == 1) goto resume_1;
else if (state == 2) goto resume_2;
else if (state == 3) goto resume_3;
else if (state == 4) goto resume_4;
else
{
args = &plus_args;
READDIR3args *in_args = ((READDIR3args*)rop->request);
args->dir = in_args->dir;
args->cookie = in_args->cookie;
*((uint64_t*)args->cookieverf) = *((uint64_t*)in_args->cookieverf);
args->dircount = 512;
args->maxcount = in_args->count;
fprintf("BUG: invalid state in nfs_kv_continue_readdir()");
abort();
}
std::string dirhash = args->dir;
std::string dir;
if (dirhash != "roothandle")
st->prefix = kv_direntry_key(dir_ino, "");
st->eof = true;
// Limit results based on maximum reply size
// Sadly we have to calculate reply size by hand
// reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
st->reply_size = 20;
if (reply_size > st->args.maxcount)
{
auto dir_it = self->parent->dir_by_hash.find(dirhash);
if (dir_it != self->parent->dir_by_hash.end())
dir = dir_it->second;
}
std::string prefix = dir.size() ? dir+"/" : self->parent->name_prefix;
std::map<std::string, struct entryplus3> entries;
for (auto & ic: self->parent->cli->st_cli.inode_config)
{
auto & inode_cfg = ic.second;
if (prefix != "" && inode_cfg.name.substr(0, prefix.size()) != prefix)
continue;
std::string subname = inode_cfg.name.substr(prefix.size());
int p = 0;
while (p < subname.size() && subname[p] == '/')
p++;
if (p > 0)
subname = subname.substr(p);
if (subname.size() == 0)
continue;
p = 0;
while (p < subname.size() && subname[p] != '/')
p++;
if (p >= subname.size())
{
// fileid will change when the user creates snapshots
// however, we hope that clients tolerate it well
// Linux does, even though it complains about "fileid changed" in dmesg
entries[subname].fileid = ic.first;
if (is_plus)
{
entries[subname].name_attributes = (post_op_attr){
.attributes_follow = 1,
.attributes = get_file_attributes(self, ic.first),
};
entries[subname].name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(inode_cfg.name))),
};
}
}
else
{
// skip directories, they will be added from dir_info
}
}
// Add directories from dir_info
for (auto dir_id_it = self->parent->dir_info.lower_bound(prefix);
dir_id_it != self->parent->dir_info.end(); dir_id_it++)
{
if (prefix != "" && dir_id_it->first.substr(0, prefix.size()) != prefix)
break;
if (dir_id_it->first.size() == prefix.size() ||
dir_id_it->first.find("/", prefix.size()) != std::string::npos)
continue;
std::string subname = dir_id_it->first.substr(prefix.size());
// for directories, fileid changes when the user restarts proxy
fill_dir_entry(self, rop, dir_id_it, &entries[subname], is_plus);
// Error, too small max reply size
auto cb = std::move(st->cb);
cb(-NFS3ERR_TOOSMALL);
return;
}
// Add . and ..
if (st->args.cookie <= 1)
{
auto dir_id_it = self->parent->dir_info.find(dir);
fill_dir_entry(self, rop, dir_id_it, &entries["."], is_plus);
auto sl = dir.rfind("/");
if (sl != std::string::npos)
kv_read_inode(st->self, kv_fh_inode(st->args.dir), [st](int res, const std::string & value, json11::Json ientry)
{
auto dir_id_it = self->parent->dir_info.find(dir.substr(0, sl));
fill_dir_entry(self, rop, dir_id_it, &entries[".."], is_plus);
st->res = res;
st->ientry_text = value;
st->ientry = ientry;
nfs_kv_continue_readdir(st, 1);
});
return;
resume_1:
if (st->res < 0)
{
auto cb = std::move(st->cb);
cb(st->res);
return;
}
if (st->args.cookie == 0)
{
auto fh = kv_fh(st->dir_ino);
auto entry_size = 20 + 4/*len_pad4(".")*/ + (is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
if (st->reply_size + entry_size > st->args.maxcount)
{
auto cb = std::move(st->cb);
cb(-NFS3ERR_TOOSMALL);
return;
}
entryplus3 dot;
dot.name = xdr_copy_string(st->rop->xdrs, ".");
dot.fileid = st->dir_ino;
dot.name_attributes = (post_op_attr){
.attributes_follow = 1,
.attributes = get_kv_attributes(self, st->dir_ino, st->ientry),
};
dot.name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(fh),
};
st->entries.push_back(dot);
st->reply_size += entry_size;
}
st->parent_ino = st->ientry["parent_ino"].uint64_value();
if (st->parent_ino)
{
kv_read_inode(st->self, st->ientry["parent_ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry)
{
st->res = res;
st->parent_ientry_text = value;
st->parent_ientry = ientry;
nfs_kv_continue_readdir(st, 2);
});
return;
resume_2:
if (st->res < 0)
{
auto cb = std::move(st->cb);
cb(st->res);
return;
}
auto fh = kv_fh(st->parent_ino);
auto entry_size = 20 + 4/*len_pad4("..")*/ + (is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
if (st->reply_size + entry_size > st->args.maxcount)
{
st->eof = false;
auto cb = std::move(st->cb);
cb(0);
return;
}
entryplus3 dotdot;
dotdot.name = xdr_copy_string(st->rop->xdrs, "..");
dotdot.fileid = st->dir_ino;
dotdot.name_attributes = (post_op_attr){
// FIXME: maybe do not read parent attributes and leave them to a GETATTR?
.attributes_follow = 1,
.attributes = get_kv_attributes(self, st->parent_ino, st->parent_ientry),
};
dotdot.name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(fh),
};
st->entries.push_back(dotdot);
st->reply_size += entry_size;
}
}
// Offset results by the continuation cookie (equal to index in the listing)
uint64_t idx = 1;
void *prev = NULL;
for (auto it = entries.begin(); it != entries.end();)
st->getattr_cur = st->entries.size();
st->list_handle = self->parent->db->list_start(prefix);
st->self->parent->db->list_next(st->list_handle, [=](int res, const std::string & key, const std::string & value)
{
entryplus3 *entry = &it->second;
// First fields of entry3 and entryplus3 are the same: fileid, name, cookie
entry->name = xdr_copy_string(rop->xdrs, it->first);
entry->cookie = idx++;
st->res = res;
st->cur_key = key;
st->cur_value = value;
nfs_kv_continue_readdir(st, 3);
});
return;
while (st->list_handle)
{
st->self->parent->db->list_next(st->list_handle, NULL);
return;
resume_3:
if (st->res == -ENOENT || st->key.size() > st->prefix.size() || st->key.substr(0, st->prefix.size()) != st->prefix)
{
self->parent->db->list_close(st->list_handle);
st->list_handle = NULL;
break;
}
auto direntry = json11::Json::parse(st->cur_value, err);
if (err != "")
{
fprintf(stderr, "readdir: direntry %s contains invalid JSON: %s, skipping\n",
st->cur_key.c_str(), st->cur_value.c_str());
continue;
}
auto ino = direntry["ino"].uint64_value();
auto name = kv_direntry_filename(st->cur_key);
auto fh = kv_fh(ino);
// 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
// 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
// + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
// + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
auto entry_size = 20 + len_pad4(name) + (is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
if (st->reply_size + entry_size > st->args.maxcount)
{
st->eof = false;
self->parent->db->list_close(list_handle);
st->list_handle = NULL;
break;
}
st->reply_size += entry_size;
auto idx = st->entries.size();
st->entries.push_back((entryplus3){});
auto entry = &st->entries[idx];
entry->name = xdr_copy_string(st->rop->xdrs, name);
entry->fileid = ino;
if (st->is_plus)
{
entry->name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(fh),
};
kv_getattr_next(st);
}
st->self->parent->db->list_next(list_handle);
}
resume_4:
while (st->getattr_running > 0)
{
return;
}
void *prev = NULL;
for (int i = 0; i < st->entries.size(); i++)
{
entryplus3 *entry = &st->entries[i];
entry->cookie = st->offset + i;
if (prev)
{
if (is_plus)
@ -1658,60 +1763,6 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
((entry3*)prev)->nextentry = (entry3*)entry;
}
prev = entry;
if (args->cookie > 0 && entry->cookie == args->cookie)
entries.erase(entries.begin(), ++it);
else
it++;
}
// Now limit results based on maximum reply size
// Sadly we have to calculate reply size by hand
// reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
int reply_size = 20;
if (reply_size > args->maxcount)
{
// Error, too small max reply size
if (is_plus)
{
READDIRPLUS3res *reply = (READDIRPLUS3res*)rop->reply;
*reply = (READDIRPLUS3res){ .status = NFS3ERR_TOOSMALL };
rpc_queue_reply(rop);
}
else
{
READDIR3res *reply = (READDIR3res*)rop->reply;
*reply = (READDIR3res){ .status = NFS3ERR_TOOSMALL };
rpc_queue_reply(rop);
}
return;
}
// 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
// 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
// + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
// + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
bool eof = true;
for (auto it = entries.begin(); it != entries.end(); it++)
{
reply_size += 20+len_pad4(it->first.size())+(is_plus
? 8+88+len_pad4(it->second.name_handle.handle.size) : 0);
if (reply_size > args->maxcount)
{
// Stop
entries.erase(it, entries.end());
eof = false;
break;
}
}
if (entries.end() != entries.begin())
{
auto last_it = entries.end();
last_it--;
if (is_plus)
((entryplus3*)&last_it->second)->nextentry = NULL;
else
{
entry3* e = ((entry3*)&last_it->second);
e->nextentry = NULL;
}
}
// Send reply
if (is_plus)
@ -1733,6 +1784,48 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
rpc_queue_reply(rop);
}
// Common entry point of READDIR and READDIRPLUS.
// Allocates the per-request state, converts READDIR arguments to the
// READDIRPLUS layout, installs the completion callback and starts the
// readdir state machine at state 0.
//   opaque  - the nfs_client_t the request belongs to
//   rop     - the RPC operation; its request holds READDIR3args or
//             READDIRPLUS3args depending on is_plus
//   is_plus - true for READDIRPLUS, false for READDIR
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
{
    auto st = new nfs_kv_readdir_state;
    st->self = (nfs_client_t*)opaque;
    st->rop = rop;
    st->is_plus = is_plus;
    if (st->is_plus)
    {
        st->args = *((READDIRPLUS3args*)rop->request);
    }
    else
    {
        READDIR3args *in_args = ((READDIR3args*)rop->request);
        st->args.dir = in_args->dir;
        st->args.cookie = in_args->cookie;
        // NOTE(review): copies the verifier as one 64-bit word through a pointer cast - confirm it is 8 bytes
        *((uint64_t*)st->args.cookieverf) = *((uint64_t*)in_args->cookieverf);
        // READDIR has no dircount argument, so a fixed value is used
        st->args.dircount = 512;
        st->args.maxcount = in_args->count;
    }
    st->dir_ino = kv_fh_inode(st->args.dir);
    // The callback only captures `st`, so the RPC operation is reached through
    // st->rop. It deletes the state, which also owns this closure; callers
    // move the callback out of the state before invoking it.
    st->cb = [st](int res)
    {
        if (st->is_plus)
        {
            READDIRPLUS3res *reply = (READDIRPLUS3res*)st->rop->reply;
            if (res < 0)
                *reply = (READDIRPLUS3res){ .status = vitastor_nfs_map_err(res) };
            else
                reply->status = NFS3_OK;
        }
        else
        {
            READDIR3res *reply = (READDIR3res*)st->rop->reply;
            if (res < 0)
                *reply = (READDIR3res){ .status = vitastor_nfs_map_err(res) };
            else
                reply->status = NFS3_OK;
        }
        rpc_queue_reply(st->rop);
        delete st;
    };
    nfs_kv_continue_readdir(st, 0);
}
static int nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
{
nfs3_readdir_common(opaque, rop, false);