Store pool ID in inode metadata

antietcd
Vitaliy Filippov 2024-03-10 16:04:56 +03:00
parent 0bde28c24a
commit dcbe1afac3
10 changed files with 73 additions and 51 deletions

View File

@ -118,41 +118,41 @@ std::string kv_direntry_filename(const std::string & key)
std::string kv_inode_key(uint64_t ino) std::string kv_inode_key(uint64_t ino)
{ {
char key[24] = { 0 }; char key[32] = { 0 };
snprintf(key, sizeof(key), "i-%jx", ino); snprintf(key, sizeof(key), "i%x", INODE_POOL(ino));
int n = strnlen(key, sizeof(key)-1) - 2; int n = strnlen(key, sizeof(key)-1);
if (n < 10) snprintf(key+n+1, sizeof(key)-n-1, "%jx", INODE_NO_POOL(ino));
key[1] = '0'+n; int m = strnlen(key+n+1, sizeof(key)-n-2);
else key[n] = 'G'+m;
key[1] = 'A'+(n-10); return std::string(key);
return std::string(key, n+2);
} }
std::string kv_fh(uint64_t ino) std::string kv_fh(uint64_t ino)
{ {
return "S"+std::string((char*)&ino, 8); char key[32] = { 0 };
snprintf(key, sizeof(key), "S%jx", ino);
return key;
} }
uint64_t kv_fh_inode(const std::string & fh) uint64_t kv_fh_inode(const std::string & fh)
{ {
if (fh.size() == 1 && fh[0] == 'R') if (fh == NFS_ROOT_HANDLE)
{ {
return 1; return 1;
} }
else if (fh.size() == 9 && fh[0] == 'S') else if (fh[0] == 'S')
{ {
return *(uint64_t*)&fh[1]; uint64_t ino = 0;
} int r = sscanf(fh.c_str()+1, "%jx", &ino);
else if (fh.size() > 17 && fh[0] == 'I') if (r == 1)
{ return ino;
return *(uint64_t*)&fh[fh.size()-8];
} }
return 0; return 0;
} }
bool kv_fh_valid(const std::string & fh) bool kv_fh_valid(const std::string & fh)
{ {
return fh == NFS_ROOT_HANDLE || fh.size() == 9 && fh[0] == 'S' || fh.size() > 17 && fh[0] == 'I'; return fh == NFS_ROOT_HANDLE || fh[0] == 'S';
} }
void nfs_kv_procs(nfs_client_t *self) void nfs_kv_procs(nfs_client_t *self)
@ -255,7 +255,6 @@ void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
strerror(-open_res), open_res); strerror(-open_res), open_res);
exit(1); exit(1);
} }
fs_base_inode = ((uint64_t)proxy->default_pool_id << (64-POOL_ID_BITS));
fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1; fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
shared_inode_threshold = pool_block_size; shared_inode_threshold = pool_block_size;
if (!cfg["shared_inode_threshold"].is_null()) if (!cfg["shared_inode_threshold"].is_null())

View File

@ -8,7 +8,6 @@
#include "nfs/nfs.h" #include "nfs/nfs.h"
#define KV_ROOT_INODE 1 #define KV_ROOT_INODE 1
#define KV_NEXT_ID_KEY "id"
#define SHARED_FILE_MAGIC_V1 0x711A5158A6EDF17E #define SHARED_FILE_MAGIC_V1 0x711A5158A6EDF17E
struct nfs_kv_write_state; struct nfs_kv_write_state;
@ -42,13 +41,18 @@ struct kv_inode_extend_t
std::vector<std::function<void()>> waiters; std::vector<std::function<void()>> waiters;
}; };
struct kv_idgen_t
{
uint64_t next_id = 1, allocated_id = 0;
std::vector<uint64_t> unallocated_ids;
};
struct kv_fs_state_t struct kv_fs_state_t
{ {
nfs_proxy_t *proxy = NULL; nfs_proxy_t *proxy = NULL;
int touch_timer_id = -1; int touch_timer_id = -1;
uint64_t fs_kv_inode = 0; uint64_t fs_kv_inode = 0;
uint64_t fs_base_inode = 0;
uint64_t fs_inode_count = 0; uint64_t fs_inode_count = 0;
int readdir_getattr_parallel = 8, id_alloc_batch_size = 200; int readdir_getattr_parallel = 8, id_alloc_batch_size = 200;
uint64_t pool_block_size = 0; uint64_t pool_block_size = 0;
@ -57,8 +61,7 @@ struct kv_fs_state_t
uint64_t touch_interval = 1000; uint64_t touch_interval = 1000;
std::map<list_cookie_t, list_cookie_val_t> list_cookies; std::map<list_cookie_t, list_cookie_val_t> list_cookies;
uint64_t fs_next_id = 1, fs_allocated_id = 0; std::map<pool_id_t, kv_idgen_t> idgen;
std::vector<uint64_t> unallocated_ids;
std::vector<shared_alloc_queue_t> allocating_shared; std::vector<shared_alloc_queue_t> allocating_shared;
uint64_t cur_shared_inode = 0, cur_shared_offset = 0; uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
std::map<inode_t, kv_inode_extend_t> extends; std::map<inode_t, kv_inode_extend_t> extends;
@ -105,7 +108,7 @@ std::string kv_inode_key(uint64_t ino);
std::string kv_fh(uint64_t ino); std::string kv_fh(uint64_t ino);
uint64_t kv_fh_inode(const std::string & fh); uint64_t kv_fh_inode(const std::string & fh);
bool kv_fh_valid(const std::string & fh); bool kv_fh_valid(const std::string & fh);
void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t new_id)> cb); void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(int res, uint64_t new_id)> cb);
void kv_read_inode(nfs_proxy_t *proxy, uint64_t ino, void kv_read_inode(nfs_proxy_t *proxy, uint64_t ino,
std::function<void(int res, const std::string & value, json11::Json ientry)> cb, std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
bool allow_cache = false); bool allow_cache = false);

View File

@ -9,19 +9,30 @@
#include "nfs_proxy.h" #include "nfs_proxy.h"
#include "nfs_kv.h" #include "nfs_kv.h"
void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t new_id)> cb) void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(int res, uint64_t new_id)> cb)
{ {
if (self->parent->kvfs->fs_next_id <= self->parent->kvfs->fs_allocated_id) auto & idgen = self->parent->kvfs->idgen[pool_id];
if (idgen.unallocated_ids.size())
{ {
cb(0, self->parent->kvfs->fs_next_id++); auto new_id = idgen.unallocated_ids.back();
idgen.unallocated_ids.pop_back();
cb(0, INODE_WITH_POOL(pool_id, new_id));
return; return;
} }
else if (self->parent->kvfs->fs_next_id > self->parent->kvfs->fs_inode_count) else if (idgen.next_id <= idgen.allocated_id)
{
idgen.next_id++;
cb(0, INODE_WITH_POOL(pool_id, idgen.next_id-1));
return;
}
// FIXME: Partial per-pool max ID limits
// FIXME: Fool protection from block volume and FS file ID overlap
else if (idgen.next_id >= ((uint64_t)1 << (64-POOL_ID_BITS)))
{ {
cb(-ENOSPC, 0); cb(-ENOSPC, 0);
return; return;
} }
self->parent->db->get(KV_NEXT_ID_KEY, [=](int res, const std::string & prev_str) self->parent->db->get((pool_id ? "id"+std::to_string(pool_id) : "id"), [=](int res, const std::string & prev_str)
{ {
if (res < 0 && res != -ENOENT) if (res < 0 && res != -ENOENT)
{ {
@ -29,7 +40,7 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
return; return;
} }
uint64_t prev_val = stoull_full(prev_str); uint64_t prev_val = stoull_full(prev_str);
if (prev_val >= self->parent->kvfs->fs_inode_count) if (prev_val >= ((uint64_t)1 << (64-POOL_ID_BITS)))
{ {
cb(-ENOSPC, 0); cb(-ENOSPC, 0);
return; return;
@ -43,12 +54,12 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
{ {
new_val = self->parent->kvfs->fs_inode_count; new_val = self->parent->kvfs->fs_inode_count;
} }
self->parent->db->set(KV_NEXT_ID_KEY, std::to_string(new_val), [=](int res) self->parent->db->set((pool_id ? "id"+std::to_string(pool_id) : "id"), std::to_string(new_val), [=](int res)
{ {
if (res == -EAGAIN) if (res == -EAGAIN)
{ {
// CAS failure - retry // CAS failure - retry
allocate_new_id(self, cb); allocate_new_id(self, pool_id, cb);
} }
else if (res < 0) else if (res < 0)
{ {
@ -56,9 +67,10 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
} }
else else
{ {
self->parent->kvfs->fs_next_id = prev_val+2; auto & idgen = self->parent->kvfs->idgen[pool_id];
self->parent->kvfs->fs_allocated_id = new_val; idgen.next_id = prev_val+2;
cb(0, prev_val+1); idgen.allocated_id = new_val;
cb(0, INODE_WITH_POOL(pool_id, prev_val+1));
} }
}, [prev_val](int res, const std::string & value) }, [prev_val](int res, const std::string & value)
{ {
@ -76,7 +88,9 @@ struct kv_create_state
uint64_t verf = 0; uint64_t verf = 0;
uint64_t dir_ino = 0; uint64_t dir_ino = 0;
std::string filename; std::string filename;
// state
int res = 0; int res = 0;
pool_id_t pool_id = 0;
uint64_t new_id = 0; uint64_t new_id = 0;
json11::Json::object attrobj; json11::Json::object attrobj;
json11::Json attrs; json11::Json attrs;
@ -107,7 +121,11 @@ static void kv_continue_create(kv_create_state *st, int state)
st->attrs = std::move(st->attrobj); st->attrs = std::move(st->attrobj);
resume_1: resume_1:
// Generate inode ID // Generate inode ID
allocate_new_id(st->self, [st](int res, uint64_t new_id) // Directories and special files don't need pool
st->pool_id = kv_map_type(st->attrs["type"].string_value()) == NF3REG
? st->self->parent->default_pool_id
: 0;
allocate_new_id(st->self, st->pool_id, [st](int res, uint64_t new_id)
{ {
st->res = res; st->res = res;
st->new_id = new_id; st->new_id = new_id;
@ -195,7 +213,8 @@ resume_5:
} }
else else
{ {
st->self->parent->kvfs->unallocated_ids.push_back(st->new_id); auto & idgen = st->self->parent->kvfs->idgen[INODE_POOL(st->new_id)];
idgen.unallocated_ids.push_back(INODE_NO_POOL(st->new_id));
} }
if (st->dup_ino) if (st->dup_ino)
{ {

View File

@ -70,7 +70,7 @@ resume_1:
st->op = new cluster_op_t; st->op = new cluster_op_t;
{ {
st->op->opcode = OSD_OP_READ; st->op->opcode = OSD_OP_READ;
st->op->inode = st->self->parent->kvfs->fs_base_inode + st->ientry["shared_ino"].uint64_value(); st->op->inode = st->ientry["shared_ino"].uint64_value();
// Always read including header to react if the file was possibly moved away // Always read including header to react if the file was possibly moved away
auto read_offset = st->ientry["shared_offset"].uint64_value(); auto read_offset = st->ientry["shared_offset"].uint64_value();
st->op->offset = align_down(read_offset); st->op->offset = align_down(read_offset);
@ -136,7 +136,7 @@ resume_2:
st->buf = st->aligned_buf + st->offset - st->aligned_offset; st->buf = st->aligned_buf + st->offset - st->aligned_offset;
st->op = new cluster_op_t; st->op = new cluster_op_t;
st->op->opcode = OSD_OP_READ; st->op->opcode = OSD_OP_READ;
st->op->inode = st->self->parent->kvfs->fs_base_inode + st->ino; st->op->inode = st->ino;
st->op->offset = st->aligned_offset; st->op->offset = st->aligned_offset;
st->op->len = st->aligned_size; st->op->len = st->aligned_size;
st->op->iov.push_back(st->aligned_buf, st->aligned_size); st->op->iov.push_back(st->aligned_buf, st->aligned_size);

View File

@ -233,8 +233,8 @@ resume_6:
{ {
// Remove data // Remove data
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, { "inode", INODE_NO_POOL(st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, { "pool", (uint64_t)INODE_POOL(st->ino) },
}), [st](const cli_result_t & r) }), [st](const cli_result_t & r)
{ {
if (r.err) if (r.err)

View File

@ -280,8 +280,8 @@ resume_8:
if (st->rm_dest_data) if (st->rm_dest_data)
{ {
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, { "inode", INODE_NO_POOL(st->new_direntry["ino"].uint64_value()) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->new_direntry["ino"].uint64_value()) }, { "pool", (uint64_t)INODE_POOL(st->new_direntry["ino"].uint64_value()) },
}), [st](const cli_result_t & r) }), [st](const cli_result_t & r)
{ {
if (r.err) if (r.err)

View File

@ -118,8 +118,8 @@ resume_2:
{ {
// Delete extra data when downsizing // Delete extra data when downsizing
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object { st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
{ "inode", INODE_NO_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, { "inode", INODE_NO_POOL(st->ino) },
{ "pool", (uint64_t)INODE_POOL(st->self->parent->kvfs->fs_base_inode + st->ino) }, { "pool", (uint64_t)INODE_POOL(st->ino) },
{ "min_offset", st->set_attrs["size"].uint64_value() }, { "min_offset", st->set_attrs["size"].uint64_value() },
}), [st](const cli_result_t & r) }), [st](const cli_result_t & r)
{ {

View File

@ -95,7 +95,7 @@ static void allocate_shared_inode(nfs_kv_write_state *st, int state)
{ {
return; return;
} }
allocate_new_id(st->self, [st](int res, uint64_t new_id) allocate_new_id(st->self, st->self->parent->default_pool_id, [st](int res, uint64_t new_id)
{ {
if (res < 0) if (res < 0)
{ {
@ -133,7 +133,7 @@ static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::func
{ {
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE; op->opcode = OSD_OP_WRITE;
op->inode = st->self->parent->kvfs->fs_base_inode + ino; op->inode = ino;
op->offset = offset; op->offset = offset;
op->len = size; op->len = size;
prepare(op); prepare(op);
@ -178,7 +178,7 @@ void nfs_do_rmw(nfs_rmw_t *rmw)
} }
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_READ; op->opcode = OSD_OP_READ;
op->inode = parent->kvfs->fs_base_inode + rmw->ino; op->inode = rmw->ino;
op->offset = rmw->offset & ~(align-1); op->offset = rmw->offset & ~(align-1);
op->len = align; op->len = align;
op->iov.push_back(rmw->part_buf, op->len); op->iov.push_back(rmw->part_buf, op->len);
@ -209,7 +209,7 @@ void nfs_do_rmw(nfs_rmw_t *rmw)
bool is_end = ((rmw->offset+rmw->size) % align); bool is_end = ((rmw->offset+rmw->size) % align);
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE; op->opcode = OSD_OP_WRITE;
op->inode = parent->kvfs->fs_base_inode + rmw->ino; op->inode = rmw->ino;
op->offset = rmw->offset & ~(align-1); op->offset = rmw->offset & ~(align-1);
op->len = align; op->len = align;
op->version = rmw->version; op->version = rmw->version;
@ -263,7 +263,7 @@ static void nfs_do_shared_read(nfs_kv_write_state *st, int state)
uint64_t shared_offset = st->ientry["shared_offset"].uint64_value(); uint64_t shared_offset = st->ientry["shared_offset"].uint64_value();
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_READ; op->opcode = OSD_OP_READ;
op->inode = st->self->parent->kvfs->fs_base_inode + st->ientry["shared_ino"].uint64_value(); op->inode = st->ientry["shared_ino"].uint64_value();
op->offset = align_down(shared_offset); op->offset = align_down(shared_offset);
// Allow unaligned shared reads // Allow unaligned shared reads
auto pre = shared_offset-align_down(shared_offset); auto pre = shared_offset-align_down(shared_offset);
@ -759,7 +759,7 @@ resume_1:
cb(st->res == 0 ? -EINVAL : st->res); cb(st->res == 0 ? -EINVAL : st->res);
return; return;
} }
st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->kvfs->fs_base_inode + st->ino); st->was_immediate = st->self->parent->cli->get_immediate_commit(st->ino);
st->new_size = st->ientry["size"].uint64_value(); st->new_size = st->ientry["size"].uint64_value();
if (st->new_size < st->offset + st->size) if (st->new_size < st->offset + st->size)
{ {

View File

@ -23,6 +23,7 @@ public:
std::string bind_address; std::string bind_address;
uint64_t fsid = 1; uint64_t fsid = 1;
uint64_t server_id = 0; uint64_t server_id = 0;
// FIXME: Maybe allow to create files in different pools?
std::string default_pool; std::string default_pool;
std::string export_root; std::string export_root;
bool portmap_enabled; bool portmap_enabled;

View File

@ -11,7 +11,7 @@
#define POOL_ID_MAX 0x10000 #define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16 #define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS)) #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) (inode_t)(inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) #define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode)) #define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
// Pool ID is 16 bits long // Pool ID is 16 bits long