Fix NFS shared/aligned write FIXMEs

node-binding
Vitaliy Filippov 2024-03-02 01:07:30 +03:00
parent 3aee37eadd
commit 6213fbd8c6
5 changed files with 110 additions and 51 deletions

View File

@ -51,6 +51,7 @@ struct kv_fs_state_t
std::vector<shared_alloc_queue_t> allocating_shared; std::vector<shared_alloc_queue_t> allocating_shared;
uint64_t cur_shared_inode = 0, cur_shared_offset = 0; uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
std::map<inode_t, kv_inode_extend_t> extends; std::map<inode_t, kv_inode_extend_t> extends;
std::vector<uint8_t> zero_block;
}; };
struct shared_file_header_t struct shared_file_header_t

View File

@ -142,7 +142,6 @@ resume_4:
cb(st->res); cb(st->res);
} }
// FIXME: We'll need some tests for the FS
int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop) int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop)
{ {
auto st = new nfs_kv_link_state; auto st = new nfs_kv_link_state;

View File

@ -41,6 +41,7 @@ struct nfs_kv_write_state
uint64_t shared_inode = 0, shared_offset = 0; uint64_t shared_inode = 0, shared_offset = 0;
bool was_immediate = false; bool was_immediate = false;
nfs_rmw_t rmw[2]; nfs_rmw_t rmw[2];
shared_file_header_t shdr;
kv_inode_extend_t *ext = NULL; kv_inode_extend_t *ext = NULL;
~nfs_kv_write_state() ~nfs_kv_write_state()
@ -123,14 +124,14 @@ uint64_t align_shared_size(nfs_client_t *self, uint64_t size)
& ~(self->parent->pool_alignment-1); & ~(self->parent->pool_alignment-1);
} }
static void nfs_do_write(uint64_t ino, uint64_t offset, uint8_t *buf, uint64_t size, nfs_kv_write_state *st, int state) static void nfs_do_write(uint64_t ino, uint64_t offset, uint64_t size, std::function<void(cluster_op_t *op)> prepare, nfs_kv_write_state *st, int state)
{ {
auto op = new cluster_op_t; auto op = new cluster_op_t;
op->opcode = OSD_OP_WRITE; op->opcode = OSD_OP_WRITE;
op->inode = st->self->parent->fs_base_inode + ino; op->inode = st->self->parent->fs_base_inode + ino;
op->offset = offset; op->offset = offset;
op->len = size; op->len = size;
op->iov.push_back(buf, size); prepare(op);
st->waiting++; st->waiting++;
op->callback = [st, state](cluster_op_t *op) op->callback = [st, state](cluster_op_t *op)
{ {
@ -148,15 +149,12 @@ static void nfs_do_write(uint64_t ino, uint64_t offset, uint8_t *buf, uint64_t s
st->self->parent->cli->execute(op); st->self->parent->cli->execute(op);
} }
// NOTE(review): these rows appear on one side of the diff only; this looks like the
// pre-change nfs_do_shared_write that the commit replaces with a three-argument version.
// It forwards a single contiguous buffer (st->aligned_buf, st->aligned_size bytes) to
// nfs_do_write(), targeting inode st->shared_inode at offset st->shared_offset, and
// passes <state> through unchanged.
static void nfs_do_shared_write(nfs_kv_write_state *st, int state)
{
nfs_do_write(st->shared_inode, st->shared_offset, st->aligned_buf, st->aligned_size, st, state);
}
// Writes the part of st->aligned_buf that follows the shared_file_header_t
// (st->aligned_size - sizeof(shared_file_header_t) bytes) to offset 0 of inode st->ino
// via nfs_do_write(), passing <state> through.
// The rows below are side-by-side diff rows (old text, then new text on the same line):
// the old side hands the buffer pointer and length to nfs_do_write() directly, the new
// side hands the length plus a lambda that pushes the same pointer/length into op->iov.
static void nfs_do_unshare_write(nfs_kv_write_state *st, int state) static void nfs_do_unshare_write(nfs_kv_write_state *st, int state)
{ {
nfs_do_write(st->ino, 0, st->aligned_buf + sizeof(shared_file_header_t), nfs_do_write(st->ino, 0, st->aligned_size - sizeof(shared_file_header_t), [&](cluster_op_t *op)
st->aligned_size - sizeof(shared_file_header_t), st, state); {
op->iov.push_back(st->aligned_buf + sizeof(shared_file_header_t), st->aligned_size - sizeof(shared_file_header_t));
}, st, state);
} }
static void nfs_do_rmw(nfs_rmw_t *rmw) static void nfs_do_rmw(nfs_rmw_t *rmw)
@ -295,6 +293,8 @@ static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int
: align_shared_size(st->self, st->new_size); : align_shared_size(st->self, st->new_size);
st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size); st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset); memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset);
memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0,
st->aligned_size - sizeof(shared_file_header_t) - st->offset - st->size);
if (st->ientry["shared_ino"].uint64_value() != 0 && if (st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["size"].uint64_value() != 0) st->ientry["size"].uint64_value() != 0)
{ {
@ -324,71 +324,116 @@ resume_0:
.inode = st->ino, .inode = st->ino,
.alloc = st->aligned_size, .alloc = st->aligned_size,
}; };
memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size);
memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0,
st->aligned_size - sizeof(shared_file_header_t) - st->offset - st->size);
return true; return true;
} }
// The first row is a side-by-side diff row: the old nfs_do_align_write signature on the
// left, the new nfs_do_shared_write signature on the right. The body below belongs to
// the new nfs_do_shared_write.
//
// nfs_do_shared_write issues a write of st->aligned_size bytes to inode st->shared_inode
// at offset st->shared_offset through nfs_do_write(); <state> is passed through.
//   only_aligned == true  - the payload is st->aligned_buf as one iovec.
//   only_aligned == false - the payload is three iovecs, so the caller's st->buf is
//                           referenced directly instead of being read from aligned_buf.
static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t offset, int state) static void nfs_do_shared_write(nfs_kv_write_state *st, int state, bool only_aligned)
{
// NOTE(review): the lambda captures by reference; the visible part of nfs_do_write()
// calls prepare(op) before returning, so the captures should still be alive - confirm.
nfs_do_write(st->shared_inode, st->shared_offset, st->aligned_size, [&](cluster_op_t *op)
{
if (only_aligned)
op->iov.push_back(st->aligned_buf, st->aligned_size);
else
{
// Part 1: shared file header plus the first st->offset bytes, taken from aligned_buf
op->iov.push_back(st->aligned_buf, sizeof(shared_file_header_t) + st->offset);
// Part 2: the st->size bytes being written, taken from st->buf
op->iov.push_back(st->buf, st->size);
// Part 3: the rest of aligned_buf after the written range; the three lengths
// add up to st->aligned_size, matching the size passed to nfs_do_write()
op->iov.push_back(
st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size,
st->aligned_size - (sizeof(shared_file_header_t) + st->offset + st->size)
);
}
}, st, state);
}
static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t offset, uint64_t shared_alloc, int state)
{ {
auto alignment = st->self->parent->pool_alignment; auto alignment = st->self->parent->pool_alignment;
uint64_t end = (offset+st->size);
uint8_t *good_buf = st->buf; uint8_t *good_buf = st->buf;
uint64_t good_offset = offset; uint64_t good_offset = offset;
uint64_t good_size = st->size; uint64_t good_size = st->size;
bool begin_shdr = false;
uint64_t end_pad = 0;
st->waiting++; st->waiting++;
st->rmw[0].st = NULL; st->rmw[0].st = NULL;
st->rmw[1].st = NULL; st->rmw[1].st = NULL;
if (offset % alignment) if (offset % alignment)
{ {
// Requires read-modify-write in the beginning if (shared_alloc && st->offset == 0 && (offset % alignment) == sizeof(shared_file_header_t))
auto s = (alignment - (offset % alignment));
if (good_size > s)
{ {
good_buf += s; // RMW can be skipped at shared beginning
good_offset += s; st->shdr = {
good_size -= s; .magic = SHARED_FILE_MAGIC_V1,
.inode = st->ino,
.alloc = shared_alloc,
};
begin_shdr = true;
good_offset -= sizeof(shared_file_header_t);
offset = 0;
} }
else else
good_size = 0; {
s = s > st->size ? st->size : s; // Requires read-modify-write in the beginning
st->rmw[0] = { auto s = (alignment - (offset % alignment));
.st = st, if (good_size > s)
.continue_state = state, {
.ino = ino, good_buf += s;
.offset = offset, good_offset += s;
.buf = st->buf, good_size -= s;
.size = s, }
}; else
// FIXME: skip rmw at shared beginning good_size = 0;
nfs_do_rmw(&st->rmw[0]); s = s > st->size ? st->size : s;
st->rmw[0] = {
.st = st,
.continue_state = state,
.ino = ino,
.offset = offset,
.buf = st->buf,
.size = s,
};
nfs_do_rmw(&st->rmw[0]);
}
} }
if ((offset+st->size) % alignment) if ((end % alignment) &&
(offset == 0 || end/alignment > (offset-1)/alignment))
{ {
// Requires read-modify-write in the end // Requires read-modify-write in the end
auto s = ((offset+st->size) % alignment); assert(st->offset+st->size <= st->new_size);
if (good_size > s) if (st->offset+st->size == st->new_size)
good_size -= s;
else
good_size = 0;
if ((offset+st->size)/alignment > offset/alignment)
{ {
// rmw can be skipped at end - we can just zero pad the request
end_pad = alignment - (end % alignment);
}
else
{
auto s = (end % alignment);
if (good_size > s)
good_size -= s;
else
good_size = 0;
st->rmw[1] = { st->rmw[1] = {
.st = st, .st = st,
.continue_state = state, .continue_state = state,
.ino = ino, .ino = ino,
.offset = offset + st->size-s, .offset = end - s,
.buf = st->buf + st->size-s, .buf = st->buf + st->size - s,
.size = s, .size = s,
}; };
// FIXME: skip rmw at end
nfs_do_rmw(&st->rmw[1]); nfs_do_rmw(&st->rmw[1]);
} }
} }
if (good_size > 0) if (good_size > 0 || end_pad > 0 || begin_shdr)
{ {
// Normal write // Normal write
nfs_do_write(ino, good_offset, good_buf, good_size, st, state); nfs_do_write(ino, good_offset, (begin_shdr ? sizeof(shared_file_header_t) : 0)+good_size+end_pad, [&](cluster_op_t *op)
{
if (begin_shdr)
op->iov.push_back(&st->shdr, sizeof(shared_file_header_t));
op->iov.push_back(good_buf, good_size);
if (end_pad)
op->iov.push_back(st->self->parent->kvfs->zero_block.data(), end_pad);
}, st, state);
} }
st->waiting--; st->waiting--;
if (!st->waiting) if (!st->waiting)
@ -631,7 +676,7 @@ resume_1:
st->ientry["empty"].bool_value() && st->ientry["empty"].bool_value() &&
(st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold || (st->ientry["size"].uint64_value() + sizeof(shared_file_header_t)) < st->self->parent->shared_inode_threshold ||
st->ientry["shared_ino"].uint64_value() != 0 && st->ientry["shared_ino"].uint64_value() != 0 &&
st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size)) st->ientry["shared_alloc"].uint64_value() < sizeof(shared_file_header_t)+st->offset+st->size)
{ {
// Either empty, or shared and requires moving into a larger place (redirect-write) // Either empty, or shared and requires moving into a larger place (redirect-write)
allocate_shared_inode(st, 2, st->new_size); allocate_shared_inode(st, 2, st->new_size);
@ -646,7 +691,7 @@ resume_2:
resume_3: resume_3:
if (!nfs_do_shared_readmodify(st, 3, state, false)) if (!nfs_do_shared_readmodify(st, 3, state, false))
return; return;
nfs_do_shared_write(st, 4); // FIXME assemble from parts, do not copy? nfs_do_shared_write(st, 4, false);
return; return;
resume_4: resume_4:
if (st->res < 0) if (st->res < 0)
@ -669,7 +714,7 @@ resume_5:
{ {
st->res2 = st->res; st->res2 = st->res;
memset(st->aligned_buf, 0, st->aligned_size); memset(st->aligned_buf, 0, st->aligned_size);
nfs_do_shared_write(st, 6); nfs_do_shared_write(st, 6, true);
return; return;
resume_6: resume_6:
free(st->aligned_buf); free(st->aligned_buf);
@ -689,11 +734,12 @@ resume_6:
cb(0); cb(0);
return; return;
} }
else if (st->ientry["shared_ino"].uint64_value() > 0) else if (st->ientry["shared_ino"].uint64_value() != 0)
{ {
// Non-empty, shared, can be updated in-place // Non-empty, shared, can be updated in-place
nfs_do_align_write(st, st->ientry["shared_ino"].uint64_value(), nfs_do_align_write(st, st->ientry["shared_ino"].uint64_value(),
st->ientry["shared_offset"].uint64_value() + sizeof(shared_file_header_t) + st->offset, 7); st->ientry["shared_offset"].uint64_value() + sizeof(shared_file_header_t) + st->offset,
st->ientry["shared_alloc"].uint64_value(), 7);
return; return;
resume_7: resume_7:
if (st->res == 0 && st->stable && !st->was_immediate) if (st->res == 0 && st->stable && !st->was_immediate)
@ -763,7 +809,7 @@ resume_12:
st->ientry_text = new_unshared_ientry(st); st->ientry_text = new_unshared_ientry(st);
} }
// Non-shared write // Non-shared write
nfs_do_align_write(st, st->ino, st->offset, 13); nfs_do_align_write(st, st->ino, st->offset, 0, 13);
return; return;
resume_13: resume_13:
if (st->res == 0 && st->stable && !st->was_immediate) if (st->res == 0 && st->stable && !st->was_immediate)

View File

@ -213,7 +213,12 @@ void nfs_proxy_t::run(json11::Json cfg)
fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS)); fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS));
fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1; fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
shared_inode_threshold = pool_block_size; shared_inode_threshold = pool_block_size;
if (!cfg["shared_inode_threshold"].is_null())
{
shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value();
}
kvfs = new kv_fs_state_t; kvfs = new kv_fs_state_t;
kvfs->zero_block.resize(pool_block_size);
} }
// Self-register portmap and NFS // Self-register portmap and NFS
pmap.reg_ports.insert((portmap_id_t){ pmap.reg_ports.insert((portmap_id_t){

View File

@ -40,7 +40,15 @@ cp ./testdata/nfs/f1 ./testdata/f1_nfs
diff ./testdata/f1_90k ./testdata/nfs/f1 diff ./testdata/f1_90k ./testdata/nfs/f1
format_green "90K data ok" format_green "90K data ok"
# move it inplace # test partial shared overwrite
dd if=/dev/urandom of=./testdata/f1_90k bs=9317 count=1 seek=5 conv=notrunc
dd if=./testdata/f1_90k of=./testdata/nfs/f1 bs=9317 count=1 skip=5 seek=5 conv=notrunc
sudo umount ./testdata/nfs/
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
diff ./testdata/f1_90k ./testdata/nfs/f1
format_green "partial inplace shared overwrite ok"
# move it to a larger shared space
dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1 dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1
cp testdata/f1_110k ./testdata/nfs/f1 cp testdata/f1_110k ./testdata/nfs/f1
sudo umount ./testdata/nfs/ sudo umount ./testdata/nfs/