Implement packing small files into shared inodes
parent
181795d748
commit
e5bb986164
|
@ -13,57 +13,155 @@
|
|||
|
||||
#include "cli.h"
|
||||
|
||||
struct nfs_kv_read_state
|
||||
{
|
||||
nfs_client_t *self = NULL;
|
||||
rpc_op_t *rop = NULL;
|
||||
bool allow_cache = true;
|
||||
inode_t ino = 0;
|
||||
uint64_t offset = 0, size = 0;
|
||||
std::function<void(int)> cb;
|
||||
// state
|
||||
int res = 0;
|
||||
json11::Json ientry;
|
||||
uint64_t aligned_size = 0, aligned_offset = 0;
|
||||
uint8_t *aligned_buf = NULL;
|
||||
cluster_op_t *op = NULL;
|
||||
uint8_t *buf = NULL;
|
||||
};
|
||||
|
||||
static void nfs_kv_continue_read(nfs_kv_read_state *st, int state)
|
||||
{
|
||||
if (state == 0) {}
|
||||
else if (state == 1) goto resume_1;
|
||||
else if (state == 2) goto resume_2;
|
||||
else if (state == 3) goto resume_3;
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "BUG: invalid state in nfs_kv_continue_read()");
|
||||
abort();
|
||||
}
|
||||
if (st->offset + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
|
||||
{
|
||||
kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
|
||||
{
|
||||
st->res = res;
|
||||
st->ientry = attrs;
|
||||
nfs_kv_continue_read(st, 1);
|
||||
}, st->allow_cache);
|
||||
return;
|
||||
resume_1:
|
||||
if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG)
|
||||
{
|
||||
auto cb = std::move(st->cb);
|
||||
cb(st->res < 0 ? st->res : -EINVAL);
|
||||
return;
|
||||
}
|
||||
if (st->ientry["shared_ino"].uint64_value() != 0)
|
||||
{
|
||||
st->aligned_size = align_shared_size(st->self, st->offset+st->size);
|
||||
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
|
||||
st->buf = st->aligned_buf + sizeof(shared_file_header_t) + st->offset;
|
||||
st->op = new cluster_op_t;
|
||||
st->op->opcode = OSD_OP_READ;
|
||||
st->op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value();
|
||||
st->op->offset = st->ientry["shared_offset"].uint64_value();
|
||||
if (st->offset+st->size > st->ientry["size"].uint64_value())
|
||||
{
|
||||
st->op->len = align_shared_size(st->self, st->ientry["size"].uint64_value());
|
||||
memset(st->aligned_buf+st->op->len, 0, st->aligned_size-st->op->len);
|
||||
}
|
||||
else
|
||||
st->op->len = st->aligned_size;
|
||||
st->op->iov.push_back(st->aligned_buf, st->op->len);
|
||||
st->op->callback = [st, state](cluster_op_t *op)
|
||||
{
|
||||
st->res = op->retval == op->len ? 0 : op->retval;
|
||||
delete op;
|
||||
nfs_kv_continue_read(st, 2);
|
||||
};
|
||||
st->self->parent->cli->execute(st->op);
|
||||
return;
|
||||
resume_2:
|
||||
if (st->res < 0)
|
||||
{
|
||||
auto cb = std::move(st->cb);
|
||||
cb(st->res);
|
||||
return;
|
||||
}
|
||||
auto hdr = ((shared_file_header_t*)st->aligned_buf);
|
||||
if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino ||
|
||||
align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
|
||||
{
|
||||
// Got unrelated data - retry from the beginning
|
||||
free(st->aligned_buf);
|
||||
st->aligned_buf = NULL;
|
||||
st->allow_cache = false;
|
||||
nfs_kv_continue_read(st, 0);
|
||||
return;
|
||||
}
|
||||
auto cb = std::move(st->cb);
|
||||
cb(0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1));
|
||||
st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment) &
|
||||
~(st->self->parent->pool_alignment-1)) - st->aligned_offset;
|
||||
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
|
||||
st->buf = st->aligned_buf + st->offset - st->aligned_offset;
|
||||
st->op = new cluster_op_t;
|
||||
st->op->opcode = OSD_OP_READ;
|
||||
st->op->inode = st->self->parent->fs_base_inode + st->ino;
|
||||
st->op->offset = st->aligned_offset;
|
||||
st->op->len = st->aligned_size;
|
||||
st->op->iov.push_back(st->aligned_buf, st->aligned_size);
|
||||
st->op->callback = [st](cluster_op_t *op)
|
||||
{
|
||||
st->res = op->retval;
|
||||
delete op;
|
||||
nfs_kv_continue_read(st, 3);
|
||||
};
|
||||
st->self->parent->cli->execute(st->op);
|
||||
return;
|
||||
resume_3:
|
||||
auto cb = std::move(st->cb);
|
||||
cb(st->res);
|
||||
return;
|
||||
}
|
||||
|
||||
int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
READ3args *args = (READ3args*)rop->request;
|
||||
READ3res *reply = (READ3res*)rop->reply;
|
||||
inode_t ino = kv_fh_inode(args->file);
|
||||
auto ino = kv_fh_inode(args->file);
|
||||
if (args->count > MAX_REQUEST_SIZE || !ino)
|
||||
{
|
||||
*reply = (READ3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
uint64_t alignment = self->parent->cli->st_cli.global_bitmap_granularity;
|
||||
auto pool_cfg = self->parent->cli->st_cli.pool_config.find(INODE_POOL(self->parent->fs_base_inode));
|
||||
if (pool_cfg != self->parent->cli->st_cli.pool_config.end())
|
||||
auto st = new nfs_kv_read_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
st->ino = ino;
|
||||
st->offset = args->offset;
|
||||
st->size = args->count;
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
alignment = pool_cfg->second.bitmap_granularity;
|
||||
}
|
||||
uint64_t aligned_offset = args->offset - (args->offset % alignment);
|
||||
uint64_t aligned_count = args->offset + args->count;
|
||||
if (aligned_count % alignment)
|
||||
aligned_count = aligned_count + alignment - (aligned_count % alignment);
|
||||
aligned_count -= aligned_offset;
|
||||
void *buf = malloc_or_die(aligned_count);
|
||||
xdr_add_malloc(rop->xdrs, buf);
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->opcode = OSD_OP_READ;
|
||||
op->inode = self->parent->fs_base_inode + ino;
|
||||
op->offset = aligned_offset;
|
||||
op->len = aligned_count;
|
||||
op->iov.push_back(buf, aligned_count);
|
||||
*reply = (READ3res){ .status = NFS3_OK };
|
||||
reply->resok.data.data = (char*)buf + args->offset - aligned_offset;
|
||||
reply->resok.data.size = args->count;
|
||||
op->callback = [rop](cluster_op_t *op)
|
||||
{
|
||||
READ3res *reply = (READ3res*)rop->reply;
|
||||
if (op->retval != op->len)
|
||||
READ3res *reply = (READ3res*)st->rop->reply;
|
||||
*reply = (READ3res){ .status = vitastor_nfs_map_err(res) };
|
||||
if (res == 0)
|
||||
{
|
||||
*reply = (READ3res){ .status = vitastor_nfs_map_err(-op->retval) };
|
||||
xdr_add_malloc(st->rop->xdrs, st->aligned_buf);
|
||||
reply->resok.data.data = (char*)st->buf;
|
||||
reply->resok.data.size = st->size;
|
||||
reply->resok.count = st->size;
|
||||
reply->resok.eof = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & reply_ok = reply->resok;
|
||||
// reply_ok.data.data is already set above
|
||||
reply_ok.count = reply_ok.data.size;
|
||||
reply_ok.eof = 0;
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
delete op;
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
nfs_kv_continue_read(st, 0);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -228,7 +228,8 @@ resume_6:
|
|||
return;
|
||||
}
|
||||
// (6) If regular file and inode is deleted: delete data
|
||||
if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1)
|
||||
if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 &&
|
||||
!st->ientry["shared_inode"].uint64_value())
|
||||
{
|
||||
// Remove data
|
||||
st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -191,6 +191,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
}
|
||||
fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS));
|
||||
fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
|
||||
shared_inode_threshold = pool_block_size;
|
||||
}
|
||||
// Self-register portmap and NFS
|
||||
pmap.reg_ports.insert((portmap_id_t){
|
||||
|
@ -372,8 +373,11 @@ void nfs_proxy_t::check_default_pool()
|
|||
{
|
||||
if (cli->st_cli.pool_config.size() == 1)
|
||||
{
|
||||
default_pool = cli->st_cli.pool_config.begin()->second.name;
|
||||
default_pool_id = cli->st_cli.pool_config.begin()->first;
|
||||
auto pool_it = cli->st_cli.pool_config.begin();
|
||||
default_pool_id = pool_it->first;
|
||||
default_pool = pool_it->second.name;
|
||||
pool_block_size = pool_it->second.pg_stripe_size;
|
||||
pool_alignment = pool_it->second.bitmap_granularity;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -388,6 +392,8 @@ void nfs_proxy_t::check_default_pool()
|
|||
if (p.second.name == default_pool)
|
||||
{
|
||||
default_pool_id = p.first;
|
||||
pool_block_size = p.second.pg_stripe_size;
|
||||
pool_alignment = p.second.bitmap_granularity;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,22 @@ struct list_cookie_val_t
|
|||
std::string key;
|
||||
};
|
||||
|
||||
struct nfs_kv_write_state;
|
||||
|
||||
// Element type of nfs_proxy_t::allocating_shared.
// NOTE(review): presumably one write request waiting for space in a shared
// inode - confirm against the write path, which is not visible here.
struct shared_alloc_queue_t
|
||||
{
|
||||
// Write request state this entry refers to
nfs_kv_write_state *st;
|
||||
// NOTE(review): looks like the resume state to continue <st> from - confirm
int state;
|
||||
// NOTE(review): presumably the number of bytes requested - confirm
uint64_t size;
|
||||
};
|
||||
|
||||
// Per-inode record stored in nfs_proxy_t::extends (keyed by inode_t).
// NOTE(review): presumably tracks in-progress file size extension - confirm
// against the write path, which is not visible here.
struct inode_extend_t
|
||||
{
|
||||
// Reference counter, starts at 0
int refcnt = 0;
|
||||
// NOTE(review): presumably the size being applied now, the next requested
// size and the last completed size - confirm
uint64_t cur_extend = 0, next_extend = 0, done_extend = 0;
|
||||
// Callbacks without arguments, stored for later invocation
std::vector<std::function<void()>> waiters;
|
||||
};
|
||||
|
||||
class nfs_proxy_t
|
||||
{
|
||||
public:
|
||||
|
@ -47,6 +63,9 @@ public:
|
|||
int trace = 0;
|
||||
|
||||
pool_id_t default_pool_id;
|
||||
uint64_t pool_block_size = 0;
|
||||
uint64_t pool_alignment = 0;
|
||||
uint64_t shared_inode_threshold = 0;
|
||||
|
||||
portmap_service_t pmap;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
|
@ -57,6 +76,9 @@ public:
|
|||
std::map<list_cookie_t, list_cookie_val_t> list_cookies;
|
||||
uint64_t fs_next_id = 0, fs_allocated_id = 0;
|
||||
std::vector<uint64_t> unallocated_ids;
|
||||
std::vector<shared_alloc_queue_t> allocating_shared;
|
||||
uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
|
||||
std::map<inode_t, inode_extend_t> extends;
|
||||
|
||||
std::vector<XDR*> xdr_pool;
|
||||
|
||||
|
@ -76,6 +98,7 @@ public:
|
|||
void daemonize();
|
||||
};
|
||||
|
||||
// FIXME: Move to "proto"
|
||||
struct rpc_cur_buffer_t
|
||||
{
|
||||
uint8_t *buf;
|
||||
|
@ -97,30 +120,6 @@ struct rpc_free_buffer_t
|
|||
unsigned size;
|
||||
};
|
||||
|
||||
// (inode, new size) pair; used as the key of nfs_client_t::extend_writes
// and ordered by the operator < defined for this type.
struct extend_size_t
|
||||
{
|
||||
// Inode the entry refers to
inode_t inode;
|
||||
// NOTE(review): presumably the file size requested by the extending write - confirm
uint64_t new_size;
|
||||
};
|
||||
|
||||
inline bool operator < (const extend_size_t &a, const extend_size_t &b)
|
||||
{
|
||||
return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
|
||||
}
|
||||
|
||||
// Value type of nfs_client_t::extend_writes: one RPC operation together with
// the status of its two sub-steps.
struct extend_write_t
|
||||
{
|
||||
// RPC operation the entry belongs to
rpc_op_t *rop;
|
||||
// Status of the resize and of the write step
int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
|
||||
};
|
||||
|
||||
// Per-inode record stored in nfs_client_t::extends (keyed by inode_t).
struct extend_inode_t
|
||||
{
|
||||
// NOTE(review): presumably the size being applied now and the next requested size - confirm
uint64_t cur_extend = 0, next_extend = 0;
|
||||
// NOTE(review): presumably the serialized inode entry as it was before the update - confirm
std::string old_ientry;
|
||||
// Inode attributes as a JSON object
json11::Json::object attrs;
|
||||
};
|
||||
|
||||
class nfs_client_t
|
||||
{
|
||||
public:
|
||||
|
@ -135,8 +134,6 @@ public:
|
|||
rpc_cur_buffer_t cur_buffer = { 0 };
|
||||
std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
|
||||
std::vector<rpc_free_buffer_t> free_buffers;
|
||||
std::map<inode_t, extend_inode_t> extends;
|
||||
std::multimap<extend_size_t, extend_write_t> extend_writes;
|
||||
|
||||
iovec read_iov;
|
||||
msghdr read_msg = { 0 };
|
||||
|
@ -166,6 +163,14 @@ public:
|
|||
#define KV_ROOT_INODE 1
|
||||
#define KV_NEXT_ID_KEY "id"
|
||||
#define KV_ROOT_HANDLE "R"
|
||||
#define SHARED_FILE_MAGIC_V1 0x711A5158A6EDF17E
|
||||
|
||||
// Header found at the beginning of a file stored inside a shared inode.
// The structure is overlaid directly on raw read buffers, so its layout
// must stay exactly three consecutive 64-bit fields.
struct shared_file_header_t
{
    // Expected to be equal to SHARED_FILE_MAGIC_V1
    uint64_t magic = 0;
    // Inode number of the file the data belongs to
    uint64_t inode = 0;
    // File size recorded in the header
    uint64_t size = 0;
};

// Guard the layout: the header is read by casting a byte buffer to this type
static_assert(sizeof(shared_file_header_t) == 3*sizeof(uint64_t),
    "shared_file_header_t must be exactly three 64-bit fields without padding");
|
||||
|
||||
nfsstat3 vitastor_nfs_map_err(int err);
|
||||
nfstime3 nfstime_from_str(const std::string & s);
|
||||
|
@ -182,6 +187,7 @@ void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t ne
|
|||
void kv_read_inode(nfs_client_t *self, uint64_t ino,
|
||||
std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
|
||||
bool allow_cache = false);
|
||||
uint64_t align_shared_size(nfs_client_t *self, uint64_t size);
|
||||
|
||||
int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop);
|
||||
|
|
Loading…
Reference in New Issue