forked from vitalif/vitastor
Make pg_stripe_size a per-pool config
parent
ba74eece4a
commit
9f2a948712
|
@ -160,15 +160,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
|||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
// FIXME: pg_stripe_size may be a per-pool config
|
||||
if (config.find("pg_stripe_size") != config.end())
|
||||
{
|
||||
pg_stripe_size = config["pg_stripe_size"].uint64_value();
|
||||
}
|
||||
if (!pg_stripe_size)
|
||||
{
|
||||
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
|
||||
}
|
||||
if (config["immediate_commit"] == "all")
|
||||
{
|
||||
// Cluster-wide immediate_commit mode
|
||||
|
@ -473,7 +464,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||
int i = 0;
|
||||
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
|
||||
{
|
||||
pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pool_cfg.real_pg_count + 1;
|
||||
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
|
||||
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
|
||||
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
|
||||
? (stripe + pg_block_size) : (op->offset + op->len);
|
||||
|
|
|
@ -9,7 +9,6 @@
|
|||
// 4*1024; parenthesized so the macro expands safely inside larger expressions (e.g. x / MIN_BLOCK_SIZE)
#define MIN_BLOCK_SIZE (4*1024)
|
||||
// 128*1024*1024; parenthesized so the macro expands safely inside larger expressions
#define MAX_BLOCK_SIZE (128*1024*1024)
|
||||
// 128*1024; parenthesized so the macro expands safely inside larger expressions
#define DEFAULT_BLOCK_SIZE (128*1024)
|
||||
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
|
||||
#define DEFAULT_DISK_ALIGNMENT 4096
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
// 32*1024*1024; parenthesized so the macro expands safely inside larger expressions
#define DEFAULT_CLIENT_DIRTY_LIMIT (32*1024*1024)
|
||||
|
@ -54,7 +53,6 @@ class cluster_client_t
|
|||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
|
||||
uint64_t pg_stripe_size = 0;
|
||||
uint64_t bs_block_size = 0;
|
||||
uint64_t bs_disk_alignment = 0;
|
||||
uint64_t bs_bitmap_granularity = 0;
|
||||
|
|
|
@ -358,6 +358,11 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
|||
parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
|
||||
parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
|
||||
parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
|
||||
parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
|
||||
if (!parsed_cfg.pg_stripe_size)
|
||||
{
|
||||
parsed_cfg.pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
|
||||
}
|
||||
parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
|
||||
if (!parsed_cfg.max_osd_combinations)
|
||||
{
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
#define ETCD_SLOW_TIMEOUT 5000
|
||||
#define ETCD_QUICK_TIMEOUT 1000
|
||||
|
||||
// Fallback pg_stripe_size applied when a pool's config leaves it unset/zero (4*1024*1024).
// Parenthesized so the macro expands safely inside larger expressions (e.g. stripe / DEFAULT_PG_STRIPE_SIZE).
#define DEFAULT_PG_STRIPE_SIZE (4*1024*1024)
|
||||
|
||||
struct json_kv_t
|
||||
{
|
||||
std::string key;
|
||||
|
@ -46,6 +48,7 @@ struct pool_config_t
|
|||
uint64_t real_pg_count;
|
||||
std::string failure_domain;
|
||||
uint64_t max_osd_combinations;
|
||||
uint64_t pg_stripe_size;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
};
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ class Mon
|
|||
block_size: 131072,
|
||||
disk_alignment: 4096,
|
||||
bitmap_granularity: 4096,
|
||||
pg_stripe_size: 4194304,
|
||||
immediate_commit: false, // 'all' or 'small'
|
||||
client_dirty_limit: 33554432,
|
||||
peer_connect_interval: 5, // seconds. min: 1
|
||||
|
@ -101,6 +100,7 @@ class Mon
|
|||
pg_count: 100,
|
||||
failure_domain: 'host',
|
||||
max_osd_combinations: 10000,
|
||||
pg_stripe_size: 4194304,
|
||||
// FIXME add device classes/tags
|
||||
},
|
||||
...
|
||||
|
|
6
osd.cpp
6
osd.cpp
|
@ -83,12 +83,6 @@ void osd_t::parse_config(blockstore_config_t & config)
|
|||
if (client_queue_depth < 128)
|
||||
client_queue_depth = 128;
|
||||
}
|
||||
if (config.find("pg_stripe_size") != config.end())
|
||||
{
|
||||
pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
|
||||
if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
|
||||
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
|
||||
}
|
||||
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
|
||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
|
|
4
osd.h
4
osd.h
|
@ -37,7 +37,6 @@
|
|||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||
#define MAX_RECOVERY_QUEUE 2048
|
||||
#define DEFAULT_RECOVERY_QUEUE 4
|
||||
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default
|
||||
|
||||
//#define OSD_STUB
|
||||
|
||||
|
@ -110,7 +109,6 @@ class osd_t
|
|||
int inflight_ops = 0;
|
||||
blockstore_t *bs;
|
||||
uint32_t bs_block_size, bs_disk_alignment;
|
||||
uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
|
||||
ring_loop_t *ringloop;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
|
@ -201,7 +199,7 @@ class osd_t
|
|||
void submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
inline pg_num_t map_to_pg(object_id oid)
|
||||
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
|
||||
if (!pg_count)
|
||||
|
|
|
@ -125,10 +125,11 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||
cancel_primary_write(p.second);
|
||||
}
|
||||
pg.write_queue.clear();
|
||||
uint64_t pg_stripe_size = st_cli.pool_config[pg.pool_id].pg_stripe_size;
|
||||
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
|
||||
{
|
||||
// Forget this PG's unstable writes
|
||||
if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid) == pg.pg_num)
|
||||
if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid, pg_stripe_size) == pg.pg_num)
|
||||
unstable_writes.erase(it++);
|
||||
else
|
||||
it++;
|
||||
|
@ -348,7 +349,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||
op->bs_op = new blockstore_op_t();
|
||||
op->bs_op->opcode = BS_OP_LIST;
|
||||
op->bs_op->oid.stripe = pg_stripe_size;
|
||||
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||
op->bs_op->len = pg_counts[ps->pool_id];
|
||||
|
@ -392,7 +393,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||
},
|
||||
.list_pg = ps->pg_num,
|
||||
.pg_count = pg_counts[ps->pool_id],
|
||||
.pg_stripe_size = pg_stripe_size,
|
||||
.pg_stripe_size = st_cli.pool_config[ps->pool_id].pg_stripe_size,
|
||||
.min_inode = ((uint64_t)(ps->pool_id) << (64 - POOL_ID_BITS)),
|
||||
.max_inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1,
|
||||
},
|
||||
|
|
|
@ -25,8 +25,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||
// oid.stripe = starting offset of the parity stripe
|
||||
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
|
||||
};
|
||||
// FIXME: pg_stripe_size may be a per-pool config
|
||||
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_counts[pool_id] + 1;
|
||||
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1;
|
||||
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
|
||||
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
|
||||
{
|
||||
|
@ -604,7 +603,10 @@ resume_6:
|
|||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = { .pool_id = INODE_POOL(w.oid.inode), .pg_num = map_to_pg(w.oid) };
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
};
|
||||
if (pgs[wpg].state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
|
|
|
@ -21,7 +21,8 @@
|
|||
#define PG_HAS_INVALID (1<<11)
|
||||
#define PG_LEFT_ON_DEAD (1<<12)
|
||||
|
||||
// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
#define STRIPE_MASK ((uint64_t)4096 - 1)
|
||||
|
||||
// OSD object states
|
||||
|
|
Loading…
Reference in New Issue