forked from vitalif/vitastor
Actual snapshot support (untested)
parent
ffe1cd4c79
commit
691f066055
12
mon/mon.js
12
mon/mon.js
|
@ -24,6 +24,7 @@ const etcd_allow = new RegExp('^'+[
|
||||||
'config/pools',
|
'config/pools',
|
||||||
'config/osd/[1-9]\\d*',
|
'config/osd/[1-9]\\d*',
|
||||||
'config/pgs',
|
'config/pgs',
|
||||||
|
'config/inode/[1-9]\\d*/[1-9]\\d*',
|
||||||
'osd/state/[1-9]\\d*',
|
'osd/state/[1-9]\\d*',
|
||||||
'osd/stats/[1-9]\\d*',
|
'osd/stats/[1-9]\\d*',
|
||||||
'osd/inodestats/[1-9]\\d*',
|
'osd/inodestats/[1-9]\\d*',
|
||||||
|
@ -144,6 +145,17 @@ const etcd_tree = {
|
||||||
}
|
}
|
||||||
}, */
|
}, */
|
||||||
pgs: {},
|
pgs: {},
|
||||||
|
/* inode: {
|
||||||
|
<pool_id>: {
|
||||||
|
<inode_t>: {
|
||||||
|
name: string,
|
||||||
|
parent_pool?: <pool_id>,
|
||||||
|
parent_id?: <inode_t>,
|
||||||
|
readonly?: boolean,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, */
|
||||||
|
inode: {},
|
||||||
},
|
},
|
||||||
osd: {
|
osd: {
|
||||||
state: {
|
state: {
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
|
|
||||||
|
#define SCRAP_BUFFER_SIZE 4*1024*1024
|
||||||
#define PART_SENT 1
|
#define PART_SENT 1
|
||||||
#define PART_DONE 2
|
#define PART_DONE 2
|
||||||
#define PART_ERROR 4
|
#define PART_ERROR 4
|
||||||
|
@ -63,10 +64,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||||
st_cli.parse_config(config);
|
st_cli.parse_config(config);
|
||||||
st_cli.load_global_config();
|
st_cli.load_global_config();
|
||||||
|
|
||||||
// Temporary implementation: discard all bitmaps
|
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
||||||
// It will be of course replaced by the implementation of snapshots
|
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
||||||
scrap_bitmap_size = 4096;
|
|
||||||
scrap_bitmap = malloc_or_die(scrap_bitmap_size);
|
|
||||||
|
|
||||||
if (ringloop)
|
if (ringloop)
|
||||||
{
|
{
|
||||||
|
@ -91,7 +90,22 @@ cluster_client_t::~cluster_client_t()
|
||||||
{
|
{
|
||||||
ringloop->unregister_consumer(&consumer);
|
ringloop->unregister_consumer(&consumer);
|
||||||
}
|
}
|
||||||
free(scrap_bitmap);
|
free(scrap_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
cluster_op_t::~cluster_op_t()
|
||||||
|
{
|
||||||
|
if (buf)
|
||||||
|
{
|
||||||
|
free(buf);
|
||||||
|
buf = NULL;
|
||||||
|
}
|
||||||
|
if (bitmap_buf)
|
||||||
|
{
|
||||||
|
free(bitmap_buf);
|
||||||
|
part_bitmaps = NULL;
|
||||||
|
bitmap_buf = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::continue_ops(bool up_retry)
|
void cluster_client_t::continue_ops(bool up_retry)
|
||||||
|
@ -193,6 +207,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||||
{
|
{
|
||||||
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||||
}
|
}
|
||||||
|
bs_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
|
||||||
uint32_t block_order;
|
uint32_t block_order;
|
||||||
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
|
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
|
@ -269,7 +284,7 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
|
||||||
for (auto op: op_queue)
|
for (auto op: op_queue)
|
||||||
{
|
{
|
||||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
||||||
INODE_POOL(op->inode) == pool_item.first)
|
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||||
{
|
{
|
||||||
op->needs_reslice = true;
|
op->needs_reslice = true;
|
||||||
}
|
}
|
||||||
|
@ -334,6 +349,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
op->cur_inode = op->inode;
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
|
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
|
||||||
{
|
{
|
||||||
|
@ -446,7 +462,7 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
|
||||||
cluster_op_t *op = new cluster_op_t;
|
cluster_op_t *op = new cluster_op_t;
|
||||||
op->flags = OP_FLUSH_BUFFER;
|
op->flags = OP_FLUSH_BUFFER;
|
||||||
op->opcode = OSD_OP_WRITE;
|
op->opcode = OSD_OP_WRITE;
|
||||||
op->inode = oid.inode;
|
op->cur_inode = op->inode = oid.inode;
|
||||||
op->offset = oid.stripe;
|
op->offset = oid.stripe;
|
||||||
op->len = wr->len;
|
op->len = wr->len;
|
||||||
op->iov.push_back(wr->buf, wr->len);
|
op->iov.push_back(wr->buf, wr->len);
|
||||||
|
@ -484,7 +500,7 @@ resume_0:
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
pool_id_t pool_id = INODE_POOL(op->inode);
|
pool_id_t pool_id = INODE_POOL(op->cur_inode);
|
||||||
if (!pool_id)
|
if (!pool_id)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
|
@ -500,6 +516,13 @@ resume_0:
|
||||||
}
|
}
|
||||||
if (op->opcode == OSD_OP_WRITE)
|
if (op->opcode == OSD_OP_WRITE)
|
||||||
{
|
{
|
||||||
|
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||||
|
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
||||||
|
{
|
||||||
|
op->retval = -EINVAL;
|
||||||
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||||
{
|
{
|
||||||
copy_write(op, dirty_buffers);
|
copy_write(op, dirty_buffers);
|
||||||
|
@ -558,6 +581,22 @@ resume_3:
|
||||||
// Finished successfully
|
// Finished successfully
|
||||||
// Even if the PG count has changed in meanwhile we treat it as success
|
// Even if the PG count has changed in meanwhile we treat it as success
|
||||||
// because if some operations were invalid for the new PG count we'd get errors
|
// because if some operations were invalid for the new PG count we'd get errors
|
||||||
|
bool is_read = op->opcode == OSD_OP_READ;
|
||||||
|
if (is_read)
|
||||||
|
{
|
||||||
|
// Check parent inode
|
||||||
|
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||||
|
if (ino_it != st_cli.inode_config.end() &&
|
||||||
|
ino_it->second.parent_id)
|
||||||
|
{
|
||||||
|
// Continue reading from the parent inode
|
||||||
|
// FIXME: This obviously requires optimizations for long snapshot chains
|
||||||
|
op->cur_inode = ino_it->second.parent_id;
|
||||||
|
op->parts.clear();
|
||||||
|
op->done_count = 0;
|
||||||
|
goto resume_1;
|
||||||
|
}
|
||||||
|
}
|
||||||
op->retval = op->len;
|
op->retval = op->len;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -590,51 +629,130 @@ resume_3:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t &iov_pos, osd_op_buf_list_t &iov, void *scrap, int scrap_len)
|
||||||
|
{
|
||||||
|
int left = size;
|
||||||
|
while (left > 0 && iov_idx < op->iov.count)
|
||||||
|
{
|
||||||
|
int cur_left = op->iov.buf[iov_idx].iov_len - iov_pos;
|
||||||
|
if (cur_left < left)
|
||||||
|
{
|
||||||
|
if (!skip)
|
||||||
|
{
|
||||||
|
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
|
||||||
|
}
|
||||||
|
left -= cur_left;
|
||||||
|
iov_pos = 0;
|
||||||
|
iov_idx++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!skip)
|
||||||
|
{
|
||||||
|
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
|
||||||
|
}
|
||||||
|
iov_pos += left;
|
||||||
|
left = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(left == 0);
|
||||||
|
if (skip && scrap_len > 0)
|
||||||
|
{
|
||||||
|
// All skipped ranges are read into the same useless buffer
|
||||||
|
left = size;
|
||||||
|
while (left > 0)
|
||||||
|
{
|
||||||
|
int cur_left = scrap_len < left ? scrap_len : left;
|
||||||
|
iov.push_back(scrap, cur_left);
|
||||||
|
left -= cur_left;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::slice_rw(cluster_op_t *op)
|
void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||||
{
|
{
|
||||||
// Slice the request into individual object stripe requests
|
// Slice the request into individual object stripe requests
|
||||||
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
|
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
|
||||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||||
uint64_t pg_block_size = bs_block_size * pg_data_size;
|
uint64_t pg_block_size = bs_block_size * pg_data_size;
|
||||||
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
|
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
|
||||||
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
|
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||||
|
if (op->opcode == OSD_OP_READ)
|
||||||
|
{
|
||||||
|
// Allocate memory for the bitmap
|
||||||
|
unsigned object_bitmap_size = ((op->len / bs_bitmap_granularity + 7) / 8);
|
||||||
|
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||||
|
unsigned bitmap_mem = object_bitmap_size + (bs_bitmap_size * pg_data_size) * op->parts.size();
|
||||||
|
if (op->bitmap_buf_size < bitmap_mem)
|
||||||
|
{
|
||||||
|
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||||
|
if (!op->bitmap_buf_size)
|
||||||
|
{
|
||||||
|
// First allocation
|
||||||
|
memset(op->bitmap_buf, 0, object_bitmap_size);
|
||||||
|
}
|
||||||
|
op->part_bitmaps = op->bitmap_buf + object_bitmap_size;
|
||||||
|
op->bitmap_buf_size = bitmap_mem;
|
||||||
|
}
|
||||||
|
}
|
||||||
int iov_idx = 0;
|
int iov_idx = 0;
|
||||||
size_t iov_pos = 0;
|
size_t iov_pos = 0;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
|
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
|
||||||
{
|
{
|
||||||
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
|
pg_num_t pg_num = (op->cur_inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||||
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
|
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
|
||||||
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
|
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
|
||||||
? (stripe + pg_block_size) : (op->offset + op->len);
|
? (stripe + pg_block_size) : (op->offset + op->len);
|
||||||
op->parts[i] = (cluster_op_part_t){
|
op->parts[i].iov.reset();
|
||||||
.parent = op,
|
if (op->cur_inode != op->inode)
|
||||||
.offset = begin,
|
|
||||||
.len = (uint32_t)(end - begin),
|
|
||||||
.pg_num = pg_num,
|
|
||||||
.flags = 0,
|
|
||||||
};
|
|
||||||
int left = end-begin;
|
|
||||||
while (left > 0 && iov_idx < op->iov.count)
|
|
||||||
{
|
{
|
||||||
if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
|
// Read remaining parts from upper layers
|
||||||
|
uint64_t prev = begin, cur = begin;
|
||||||
|
bool skip_prev = true;
|
||||||
|
while (cur < end)
|
||||||
{
|
{
|
||||||
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
|
unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity;
|
||||||
left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
|
bool skip = (((*(uint8_t*)(op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
|
||||||
iov_pos = 0;
|
if (skip_prev != skip)
|
||||||
iov_idx++;
|
{
|
||||||
|
if (cur > prev)
|
||||||
|
{
|
||||||
|
if (prev == begin && skip_prev)
|
||||||
|
{
|
||||||
|
begin = cur;
|
||||||
|
// Just advance iov_idx & iov_pos
|
||||||
|
add_iov(cur-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
|
||||||
|
}
|
||||||
|
skip_prev = skip;
|
||||||
|
prev = cur;
|
||||||
|
}
|
||||||
|
cur += bs_bitmap_granularity;
|
||||||
}
|
}
|
||||||
|
assert(cur > prev);
|
||||||
|
if (skip_prev)
|
||||||
|
end = prev;
|
||||||
else
|
else
|
||||||
{
|
add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
|
||||||
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
|
if (end == begin)
|
||||||
iov_pos += left;
|
op->done_count++;
|
||||||
left = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
assert(left == 0);
|
else
|
||||||
|
{
|
||||||
|
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||||
|
}
|
||||||
|
op->parts[i].parent = op;
|
||||||
|
op->parts[i].offset = begin;
|
||||||
|
op->parts[i].len = (uint32_t)(end - begin);
|
||||||
|
op->parts[i].pg_num = pg_num;
|
||||||
|
op->parts[i].osd_num = 0;
|
||||||
|
op->parts[i].flags = 0;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -661,7 +779,7 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
||||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
{
|
{
|
||||||
auto part = &op->parts[i];
|
auto part = &op->parts[i];
|
||||||
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||||
if (pg_it != pool_cfg.pg_config.end() &&
|
if (pg_it != pool_cfg.pg_config.end() &&
|
||||||
!pg_it->second.pause && pg_it->second.cur_primary)
|
!pg_it->second.pause && pg_it->second.cur_primary)
|
||||||
|
@ -674,6 +792,9 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
part->osd_num = primary_osd;
|
part->osd_num = primary_osd;
|
||||||
part->flags |= PART_SENT;
|
part->flags |= PART_SENT;
|
||||||
op->inflight_count++;
|
op->inflight_count++;
|
||||||
|
uint64_t pg_bitmap_size = bs_bitmap_size * (
|
||||||
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||||
|
);
|
||||||
part->op = (osd_op_t){
|
part->op = (osd_op_t){
|
||||||
.op_type = OSD_OP_OUT,
|
.op_type = OSD_OP_OUT,
|
||||||
.peer_fd = peer_fd,
|
.peer_fd = peer_fd,
|
||||||
|
@ -683,12 +804,12 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
.id = op_id++,
|
.id = op_id++,
|
||||||
.opcode = op->opcode,
|
.opcode = op->opcode,
|
||||||
},
|
},
|
||||||
.inode = op->inode,
|
.inode = op->cur_inode,
|
||||||
.offset = part->offset,
|
.offset = part->offset,
|
||||||
.len = part->len,
|
.len = part->len,
|
||||||
} },
|
} },
|
||||||
.bitmap = scrap_bitmap,
|
.bitmap = op->opcode == OSD_OP_WRITE ? NULL : op->part_bitmaps + pg_bitmap_size*i,
|
||||||
.bitmap_len = scrap_bitmap_size,
|
.bitmap_len = (unsigned)(op->opcode == OSD_OP_WRITE ? 0 : pg_bitmap_size),
|
||||||
.callback = [this, part](osd_op_t *op_part)
|
.callback = [this, part](osd_op_t *op_part)
|
||||||
{
|
{
|
||||||
handle_op_part(part);
|
handle_op_part(part);
|
||||||
|
@ -817,6 +938,16 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||||
msgr.outbox_push(&part->op);
|
msgr.outbox_push(&part->op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void mem_or(void *res, const void *r2, unsigned int len)
|
||||||
|
{
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0; i < len; ++i)
|
||||||
|
{
|
||||||
|
// Hope the compiler vectorizes this
|
||||||
|
((uint8_t*)res)[i] = ((uint8_t*)res)[i] | ((uint8_t*)r2)[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
{
|
{
|
||||||
cluster_op_t *op = part->parent;
|
cluster_op_t *op = part->parent;
|
||||||
|
@ -856,9 +987,43 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
dirty_osds.insert(part->osd_num);
|
dirty_osds.insert(part->osd_num);
|
||||||
part->flags |= PART_DONE;
|
part->flags |= PART_DONE;
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
|
if (op->opcode == OSD_OP_READ)
|
||||||
|
{
|
||||||
|
copy_part_bitmap(op, part);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (op->inflight_count == 0)
|
if (op->inflight_count == 0)
|
||||||
{
|
{
|
||||||
continue_ops();
|
continue_ops();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part)
|
||||||
|
{
|
||||||
|
// Copy (OR) bitmap
|
||||||
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||||
|
uint32_t pg_block_size = bs_block_size * (
|
||||||
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||||
|
);
|
||||||
|
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / bs_bitmap_granularity;
|
||||||
|
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / bs_bitmap_granularity;
|
||||||
|
uint32_t part_len = part->op.req.rw.len / bs_bitmap_granularity;
|
||||||
|
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||||
|
{
|
||||||
|
// Copy bytes
|
||||||
|
mem_or(op->bitmap_buf + object_offset/8, part->op.bitmap + part_offset/8, part_len/8);
|
||||||
|
object_offset += (part_len & ~0x7);
|
||||||
|
part_offset += (part_len & ~0x7);
|
||||||
|
part_len = (part_len & 0x7);
|
||||||
|
}
|
||||||
|
while (part_len > 0)
|
||||||
|
{
|
||||||
|
// Copy bits
|
||||||
|
(*(uint8_t*)(op->bitmap_buf + (object_offset >> 3))) |= (
|
||||||
|
(((*(uint8_t*)(part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
|
||||||
|
);
|
||||||
|
part_offset++;
|
||||||
|
object_offset++;
|
||||||
|
part_len--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -34,15 +34,19 @@ struct cluster_op_t
|
||||||
int retval;
|
int retval;
|
||||||
osd_op_buf_list_t iov;
|
osd_op_buf_list_t iov;
|
||||||
std::function<void(cluster_op_t*)> callback;
|
std::function<void(cluster_op_t*)> callback;
|
||||||
|
~cluster_op_t();
|
||||||
protected:
|
protected:
|
||||||
int flags = 0;
|
int flags = 0;
|
||||||
int state = 0;
|
int state = 0;
|
||||||
|
uint64_t cur_inode; // for snapshot reads
|
||||||
void *buf = NULL;
|
void *buf = NULL;
|
||||||
cluster_op_t *orig_op = NULL;
|
cluster_op_t *orig_op = NULL;
|
||||||
bool needs_reslice = false;
|
bool needs_reslice = false;
|
||||||
bool up_wait = false;
|
bool up_wait = false;
|
||||||
int inflight_count = 0, done_count = 0;
|
int inflight_count = 0, done_count = 0;
|
||||||
std::vector<cluster_op_part_t> parts;
|
std::vector<cluster_op_part_t> parts;
|
||||||
|
void *bitmap_buf = NULL, *part_bitmaps = NULL;
|
||||||
|
unsigned bitmap_buf_size = 0;
|
||||||
friend class cluster_client_t;
|
friend class cluster_client_t;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -60,7 +64,7 @@ class cluster_client_t
|
||||||
ring_loop_t *ringloop;
|
ring_loop_t *ringloop;
|
||||||
|
|
||||||
uint64_t bs_block_size = 0;
|
uint64_t bs_block_size = 0;
|
||||||
uint64_t bs_bitmap_granularity = 0;
|
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
|
||||||
std::map<pool_id_t, uint64_t> pg_counts;
|
std::map<pool_id_t, uint64_t> pg_counts;
|
||||||
bool immediate_commit = false;
|
bool immediate_commit = false;
|
||||||
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
||||||
|
@ -77,8 +81,8 @@ class cluster_client_t
|
||||||
std::set<osd_num_t> dirty_osds;
|
std::set<osd_num_t> dirty_osds;
|
||||||
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||||
|
|
||||||
void *scrap_bitmap = NULL;
|
void *scrap_buffer = NULL;
|
||||||
unsigned scrap_bitmap_size = 0;
|
unsigned scrap_buffer_size = 0;
|
||||||
|
|
||||||
bool pgs_loaded = false;
|
bool pgs_loaded = false;
|
||||||
ring_consumer_t consumer;
|
ring_consumer_t consumer;
|
||||||
|
@ -112,4 +116,5 @@ protected:
|
||||||
int continue_sync(cluster_op_t *op);
|
int continue_sync(cluster_op_t *op);
|
||||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||||
void handle_op_part(cluster_op_part_t *part);
|
void handle_op_part(cluster_op_part_t *part);
|
||||||
|
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
|
||||||
};
|
};
|
||||||
|
|
|
@ -642,4 +642,47 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (key.substr(0, etcd_prefix.length()+14) == etcd_prefix+"/config/inode/")
|
||||||
|
{
|
||||||
|
// <etcd_prefix>/config/inode/%d/%d
|
||||||
|
uint64_t pool_id = 0;
|
||||||
|
uint64_t inode_num = 0;
|
||||||
|
char null_byte = 0;
|
||||||
|
sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
|
||||||
|
if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)) || null_byte != 0)
|
||||||
|
{
|
||||||
|
printf("Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inode_num |= (pool_id << (64-POOL_ID_BITS));
|
||||||
|
if (!value.is_object())
|
||||||
|
{
|
||||||
|
this->inode_config.erase(inode_num);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inode_t parent_inode_num = value["parent_id"].uint64_value();
|
||||||
|
if (parent_inode_num && !(parent_inode_num >> (64-POOL_ID_BITS)))
|
||||||
|
{
|
||||||
|
uint64_t parent_pool_id = value["parent_pool"].uint64_value();
|
||||||
|
if (parent_pool_id >= POOL_ID_MAX)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
|
||||||
|
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
|
||||||
|
);
|
||||||
|
parent_inode_num = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
|
||||||
|
}
|
||||||
|
this->inode_config[inode_num] = (inode_config_t){
|
||||||
|
.name = value["name"].string_value(),
|
||||||
|
.parent_id = parent_inode_num,
|
||||||
|
.readonly = value["readonly"].bool_value(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,6 +52,13 @@ struct pool_config_t
|
||||||
std::map<pg_num_t, pg_config_t> pg_config;
|
std::map<pg_num_t, pg_config_t> pg_config;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct inode_config_t
|
||||||
|
{
|
||||||
|
std::string name;
|
||||||
|
inode_t parent_id;
|
||||||
|
bool readonly;
|
||||||
|
};
|
||||||
|
|
||||||
struct websocket_t;
|
struct websocket_t;
|
||||||
|
|
||||||
struct etcd_state_client_t
|
struct etcd_state_client_t
|
||||||
|
@ -70,6 +77,7 @@ public:
|
||||||
uint64_t etcd_watch_revision = 0;
|
uint64_t etcd_watch_revision = 0;
|
||||||
std::map<pool_id_t, pool_config_t> pool_config;
|
std::map<pool_id_t, pool_config_t> pool_config;
|
||||||
std::map<osd_num_t, json11::Json> peer_states;
|
std::map<osd_num_t, json11::Json> peer_states;
|
||||||
|
std::map<inode_t, inode_config_t> inode_config;
|
||||||
|
|
||||||
std::function<void(json11::Json::object &)> on_change_hook;
|
std::function<void(json11::Json::object &)> on_change_hook;
|
||||||
std::function<void(json11::Json::object &)> on_load_config_hook;
|
std::function<void(json11::Json::object &)> on_load_config_hook;
|
||||||
|
|
|
@ -161,6 +161,7 @@ struct osd_op_t
|
||||||
osd_any_reply_t reply;
|
osd_any_reply_t reply;
|
||||||
blockstore_op_t *bs_op = NULL;
|
blockstore_op_t *bs_op = NULL;
|
||||||
void *buf = NULL;
|
void *buf = NULL;
|
||||||
|
// bitmap, bitmap_len, bmp_data are only meaningful for reads
|
||||||
void *bitmap = NULL;
|
void *bitmap = NULL;
|
||||||
unsigned bitmap_len = 0;
|
unsigned bitmap_len = 0;
|
||||||
unsigned bmp_data = 0;
|
unsigned bmp_data = 0;
|
||||||
|
|
|
@ -6,12 +6,14 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
|
||||||
|
typedef uint64_t inode_t;
|
||||||
|
|
||||||
// 16 bytes per object/stripe id
|
// 16 bytes per object/stripe id
|
||||||
// stripe = (start of the parity stripe + peer role)
|
// stripe = (start of the parity stripe + peer role)
|
||||||
// i.e. for example (256KB + one of 0,1,2)
|
// i.e. for example (256KB + one of 0,1,2)
|
||||||
struct __attribute__((__packed__)) object_id
|
struct __attribute__((__packed__)) object_id
|
||||||
{
|
{
|
||||||
uint64_t inode;
|
inode_t inode;
|
||||||
uint64_t stripe;
|
uint64_t stripe;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue