Compare commits
2 Commits
262c581400
...
66c9271cbd
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 66c9271cbd | |
Vitaliy Filippov | 7b37ba921d |
|
@ -56,6 +56,7 @@ const etcd_tree = {
|
|||
osd_out_time: 600, // seconds. min: 0
|
||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||
use_old_pg_combinator: false,
|
||||
osd_backfillfull_ratio: 0.99,
|
||||
// client and osd
|
||||
tcp_header_buffer_size: 65536,
|
||||
use_sync_send_recv: false,
|
||||
|
|
37
mon/mon.js
37
mon/mon.js
|
@ -74,6 +74,7 @@ class Mon
|
|||
this.state = JSON.parse(JSON.stringify(etcd_tree));
|
||||
this.prev_stats = { osd_stats: {}, osd_diff: {} };
|
||||
this.recheck_pgs_active = false;
|
||||
this.updating_total_stats = false;
|
||||
this.watcher_active = false;
|
||||
this.old_pg_config = false;
|
||||
this.old_pg_stats_seen = false;
|
||||
|
@ -658,7 +659,13 @@ class Mon
|
|||
this.etcd_watch_revision, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
|
||||
}
|
||||
new_pg_config.hash = tree_hash;
|
||||
return await this.save_pg_config(new_pg_config, etcd_request);
|
||||
const { backfillfull_pools } = sum_object_counts({ ...this.state, pg: { ...this.state.pg, config: new_pg_config } }, this.config);
|
||||
new_pg_config.backfillfull_pools = backfillfull_pools.length ? backfillfull_pools : undefined;
|
||||
if (!await this.save_pg_config(new_pg_config, etcd_request))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async save_pg_config(new_pg_config, etcd_request = { compare: [], success: [] })
|
||||
|
@ -730,7 +737,7 @@ class Mon
|
|||
async update_total_stats()
|
||||
{
|
||||
const txn = [];
|
||||
const { object_counts, object_bytes } = sum_object_counts(this.state, this.config);
|
||||
const { object_counts, object_bytes, backfillfull_pools } = sum_object_counts(this.state, this.config);
|
||||
let stats = sum_op_stats(this.state.osd, this.prev_stats);
|
||||
let { inode_stats, seen_pools } = sum_inode_stats(this.state, this.prev_stats);
|
||||
stats.object_counts = object_counts;
|
||||
|
@ -783,6 +790,16 @@ class Mon
|
|||
{
|
||||
await this.etcd.etcd_call('/kv/txn', { success: txn }, this.config.etcd_mon_timeout, 0);
|
||||
}
|
||||
if (!this.recheck_pgs_active &&
|
||||
backfillfull_pools.join(',') != ((this.state.pg.config||{}).no_rebalance_pools||[]).join(','))
|
||||
{
|
||||
console.log(
|
||||
(backfillfull_pools.length ? 'Pool(s) '+backfillfull_pools.join(', ') : 'No pools')+
|
||||
' are backfillfull, applying rebalance configuration'
|
||||
);
|
||||
const new_pg_config = { ...this.state.pg.config, backfillfull_pools: backfillfull_pools.length ? backfillfull_pools : undefined };
|
||||
await this.save_pg_config(new_pg_config);
|
||||
}
|
||||
}
|
||||
|
||||
schedule_update_stats()
|
||||
|
@ -794,7 +811,21 @@ class Mon
|
|||
this.stats_timer = setTimeout(() =>
|
||||
{
|
||||
this.stats_timer = null;
|
||||
this.update_total_stats().catch(console.error);
|
||||
if (this.updating_total_stats)
|
||||
{
|
||||
this.schedule_update_stats();
|
||||
return;
|
||||
}
|
||||
this.updating_total_stats = true;
|
||||
try
|
||||
{
|
||||
this.update_total_stats().catch(console.error);
|
||||
}
|
||||
catch (e)
|
||||
{
|
||||
console.error(e);
|
||||
}
|
||||
this.updating_total_stats = false;
|
||||
}, this.config.mon_stats_timeout);
|
||||
}
|
||||
|
||||
|
|
32
mon/stats.js
32
mon/stats.js
|
@ -109,6 +109,8 @@ function sum_object_counts(state, global_config)
|
|||
pgstats[pool_id] = { ...(state.pg.stats[pool_id] || {}), ...(pgstats[pool_id] || {}) };
|
||||
}
|
||||
}
|
||||
const pool_per_osd = {};
|
||||
const clean_per_osd = {};
|
||||
for (const pool_id in pgstats)
|
||||
{
|
||||
let object_size = 0;
|
||||
|
@ -143,10 +145,38 @@ function sum_object_counts(state, global_config)
|
|||
object_bytes[k] += BigInt(st[k+'_count']) * object_size;
|
||||
}
|
||||
}
|
||||
if (st.object_count)
|
||||
{
|
||||
for (const pg_osd in (((state.pg.config.items||{})[pool_id]||{})[pg_num]||{}).osd_set||[])
|
||||
{
|
||||
if (!(pg_osd in clean_per_osd))
|
||||
{
|
||||
clean_per_osd[pg_osd] = 0n;
|
||||
}
|
||||
clean_per_osd[pg_osd] += BigInt(st.object_count);
|
||||
pool_per_osd[pg_osd] = pool_per_osd[pg_osd]||{};
|
||||
pool_per_osd[pg_osd][pool_id] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return { object_counts, object_bytes };
|
||||
// If clean_per_osd[osd] is larger than osd capacity then it will fill up during rebalance
|
||||
let backfillfull_pools = {};
|
||||
for (const osd in clean_per_osd)
|
||||
{
|
||||
const st = state.osd.stats[osd];
|
||||
if (st && st.size && st.data_block_size && (BigInt(st.size)/BigInt(st.data_block_size)*
|
||||
BigInt((global_config.osd_backfillfull_ratio||0.99)*1000000)/1000000n) < clean_per_osd[osd])
|
||||
{
|
||||
for (const pool_id in pool_per_osd[osd])
|
||||
{
|
||||
backfillfull_pools[pool_id] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
backfillfull_pools = Object.keys(backfillfull_pools).sort();
|
||||
return { object_counts, object_bytes, backfillfull_pools };
|
||||
}
|
||||
|
||||
// sum_inode_stats(this.state, this.prev_stats)
|
||||
|
|
|
@ -785,7 +785,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||
}
|
||||
for (auto & pool_item: value.object_items())
|
||||
{
|
||||
pool_config_t pc;
|
||||
pool_config_t pc = {};
|
||||
// ID
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
|
@ -931,12 +931,28 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||
// Ignore old key if the new one is present
|
||||
return;
|
||||
}
|
||||
for (auto & pool_id_json: value["backfillfull_pools"].array_items())
|
||||
{
|
||||
auto pool_id = pool_id_json.uint64_value();
|
||||
auto pool_it = this->pool_config.find(pool_id);
|
||||
if (pool_it != this->pool_config.end())
|
||||
{
|
||||
pool_it->second.backfillfull |= 2;
|
||||
}
|
||||
}
|
||||
for (auto & pool_item: this->pool_config)
|
||||
{
|
||||
for (auto & pg_item: pool_item.second.pg_config)
|
||||
{
|
||||
pg_item.second.config_exists = false;
|
||||
}
|
||||
// 3 = was 1 and became 1, 0 = was 0 and became 0
|
||||
if (pool_item.second.backfillfull == 2 || pool_item.second.backfillfull == 1)
|
||||
{
|
||||
if (on_change_backfillfull_hook)
|
||||
on_change_backfillfull_hook(pool_item.first);
|
||||
}
|
||||
pool_item.second.backfillfull = pool_item.second.backfillfull >> 1;
|
||||
}
|
||||
for (auto & pool_item: value["items"].object_items())
|
||||
{
|
||||
|
|
|
@ -62,6 +62,7 @@ struct pool_config_t
|
|||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
uint64_t scrub_interval;
|
||||
std::string used_for_fs;
|
||||
int backfillfull;
|
||||
};
|
||||
|
||||
struct inode_config_t
|
||||
|
@ -131,6 +132,7 @@ public:
|
|||
std::function<json11::Json()> load_pgs_checks_hook;
|
||||
std::function<void(bool)> on_load_pgs_hook;
|
||||
std::function<void()> on_change_pool_config_hook;
|
||||
std::function<void(pool_id_t)> on_change_backfillfull_hook;
|
||||
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
|
||||
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
||||
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
||||
|
|
|
@ -35,6 +35,7 @@ struct pool_creator_t
|
|||
uint64_t new_pools_mod_rev;
|
||||
json11::Json state_node_tree;
|
||||
json11::Json new_pools;
|
||||
std::map<osd_num_t, json11::Json> osd_stats;
|
||||
|
||||
// Reports whether the state machine has reached state 100.
bool is_done()
{
    return 100 == state;
}
|
||||
|
||||
|
@ -46,8 +47,6 @@ struct pool_creator_t
|
|||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
|
@ -121,15 +120,15 @@ resume_2:
|
|||
// Get state_node_tree based on node_placement and osd stats
|
||||
{
|
||||
auto node_placement_kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
|
||||
std::map<osd_num_t, json11::Json> osd_stats;
|
||||
timespec tv_now;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_now);
|
||||
uint64_t osd_out_time = parent->cli->config["osd_out_time"].uint64_value();
|
||||
if (!osd_out_time)
|
||||
osd_out_time = 600;
|
||||
osd_stats.clear();
|
||||
parent->iterate_kvs_1(parent->etcd_result["responses"][1]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
|
||||
{
|
||||
if (value["time"].uint64_value()+osd_out_time >= tv_now.tv_sec)
|
||||
if ((uint64_t)value["time"].number_value()+osd_out_time >= tv_now.tv_sec)
|
||||
osd_stats[cur_osd] = value;
|
||||
});
|
||||
state_node_tree = get_state_node_tree(node_placement_kv.value.object_items(), osd_stats);
|
||||
|
@ -175,42 +174,18 @@ resume_3:
|
|||
}
|
||||
}
|
||||
|
||||
// Get stats (for block_size, bitmap_granularity, ...) of osds in state_node_tree
|
||||
{
|
||||
json11::Json::array osd_stats;
|
||||
|
||||
for (auto osd_num: state_node_tree["osds"].array_items())
|
||||
{
|
||||
osd_stats.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+osd_num.as_string()) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
|
||||
parent->etcd_txn(json11::Json::object{ { "success", osd_stats } });
|
||||
}
|
||||
|
||||
state = 4;
|
||||
resume_4:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
|
||||
// Filter osds from state_node_tree based on pool parameters and osd stats
|
||||
{
|
||||
std::vector<json11::Json> osd_stats;
|
||||
for (auto & ocr: parent->etcd_result["responses"].array_items())
|
||||
std::vector<json11::Json> filtered_osd_stats;
|
||||
for (auto & osd_num: state_node_tree["osds"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]);
|
||||
osd_stats.push_back(kv.value);
|
||||
auto st_it = osd_stats.find(osd_num.uint64_value());
|
||||
if (st_it != osd_stats.end())
|
||||
{
|
||||
filtered_osd_stats.push_back(st_it->second);
|
||||
}
|
||||
}
|
||||
guess_block_size(osd_stats);
|
||||
guess_block_size(filtered_osd_stats);
|
||||
state_node_tree = filter_state_node_tree_by_stats(state_node_tree, osd_stats);
|
||||
}
|
||||
|
||||
|
@ -218,8 +193,7 @@ resume_4:
|
|||
{
|
||||
auto failure_domain = cfg["failure_domain"].string_value() == ""
|
||||
? "host" : cfg["failure_domain"].string_value();
|
||||
uint64_t max_pg_size = get_max_pg_size(state_node_tree["nodes"].object_items(),
|
||||
failure_domain, cfg["root_node"].string_value());
|
||||
uint64_t max_pg_size = get_max_pg_size(state_node_tree, failure_domain, cfg["root_node"].string_value());
|
||||
|
||||
if (cfg["pg_size"].uint64_value() > max_pg_size)
|
||||
{
|
||||
|
@ -411,13 +385,11 @@ resume_8:
|
|||
};
|
||||
}
|
||||
|
||||
// Add osd if necessary
|
||||
if (node_placement.find(osd_num) == node_placement.end())
|
||||
{
|
||||
node_placement[osd_num] = json11::Json::object {
|
||||
{ "parent", osd_host }
|
||||
};
|
||||
}
|
||||
// Add osd
|
||||
node_placement[osd_num] = json11::Json::object {
|
||||
{ "parent", node_placement[osd_num]["parent"].is_null() ? osd_host : node_placement[osd_num]["parent"] },
|
||||
{ "level", "osd" },
|
||||
};
|
||||
}
|
||||
|
||||
return json11::Json::object { { "osds", existing_osds }, { "nodes", node_placement } };
|
||||
|
@ -547,15 +519,13 @@ resume_8:
|
|||
// filtered out by stats parameters (block_size, bitmap_granularity) in
|
||||
// given osd_stats and current pool config.
|
||||
// Requires: state_node_tree["osds"] must match osd_stats 1-1
|
||||
json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::vector<json11::Json> & osd_stats)
|
||||
json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::map<osd_num_t, json11::Json> & osd_stats)
|
||||
{
|
||||
auto & osds = state_node_tree["osds"].array_items();
|
||||
|
||||
// Accepted state_node_tree nodes
|
||||
auto accepted_nodes = state_node_tree["nodes"].object_items();
|
||||
|
||||
// List of accepted osds
|
||||
std::vector<std::string> accepted_osds;
|
||||
json11::Json::array accepted_osds;
|
||||
|
||||
block_size = cfg["block_size"].uint64_value()
|
||||
? cfg["block_size"].uint64_value()
|
||||
|
@ -567,21 +537,25 @@ resume_8:
|
|||
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
||||
: parent->cli->st_cli.global_immediate_commit;
|
||||
|
||||
for (size_t i = 0; i < osd_stats.size(); i++)
|
||||
for (auto osd_num_json: state_node_tree["osds"].array_items())
|
||||
{
|
||||
auto & os = osd_stats[i];
|
||||
// Get osd number
|
||||
auto osd_num = osds[i].as_string();
|
||||
auto osd_num = osd_num_json.uint64_value();
|
||||
auto os_it = osd_stats.find(osd_num);
|
||||
if (os_it == osd_stats.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto & os = os_it->second;
|
||||
if (!os["data_block_size"].is_null() && os["data_block_size"] != block_size ||
|
||||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != bitmap_granularity ||
|
||||
!os["immediate_commit"].is_null() &&
|
||||
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value(), IMMEDIATE_NONE) < immediate_commit)
|
||||
{
|
||||
accepted_nodes.erase(osd_num);
|
||||
accepted_nodes.erase(osd_num_json.as_string());
|
||||
}
|
||||
else
|
||||
{
|
||||
accepted_osds.push_back(osd_num);
|
||||
accepted_osds.push_back(osd_num_json);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -589,81 +563,28 @@ resume_8:
|
|||
}
|
||||
|
||||
// Returns maximum pg_size possible for given node_tree and failure_domain, starting at parent_node
|
||||
uint64_t get_max_pg_size(json11::Json::object node_tree, const std::string & level, const std::string & parent_node)
|
||||
uint64_t get_max_pg_size(json11::Json state_node_tree, const std::string & level, const std::string & root_node)
|
||||
{
|
||||
uint64_t max_pg_sz = 0;
|
||||
|
||||
std::vector<std::string> nodes;
|
||||
|
||||
// Check if parent node is an osd (numeric)
|
||||
if (parent_node != "" && stoull_full(parent_node))
|
||||
std::set<std::string> level_seen;
|
||||
for (auto & osd: state_node_tree["osds"].array_items())
|
||||
{
|
||||
// Add it to node list if osd is in node tree
|
||||
if (node_tree.find(parent_node) != node_tree.end())
|
||||
nodes.push_back(parent_node);
|
||||
}
|
||||
// If parent node given, ...
|
||||
else if (parent_node != "")
|
||||
{
|
||||
// ... look for child nodes of this parent
|
||||
for (auto & sn: node_tree)
|
||||
// find OSD parent at <level>, but stop at <root_node>
|
||||
auto cur_id = osd.string_value();
|
||||
auto cur = state_node_tree["nodes"][cur_id];
|
||||
while (!cur.is_null())
|
||||
{
|
||||
if (sn.second["parent"] == parent_node)
|
||||
if (cur["level"] == level)
|
||||
{
|
||||
nodes.push_back(sn.first);
|
||||
|
||||
// If we're not looking for all osds, we only need a single
|
||||
// child osd node
|
||||
if (level != "osd" && stoull_full(sn.first))
|
||||
break;
|
||||
level_seen.insert(cur_id);
|
||||
break;
|
||||
}
|
||||
if (cur_id == root_node)
|
||||
break;
|
||||
cur_id = cur["parent"].string_value();
|
||||
cur = state_node_tree["nodes"][cur_id];
|
||||
}
|
||||
}
|
||||
// No parent node given, and we're not looking for all osds
|
||||
else if (level != "osd")
|
||||
{
|
||||
// ... look for all level nodes
|
||||
for (auto & sn: node_tree)
|
||||
{
|
||||
if (sn.second["level"] == level)
|
||||
{
|
||||
nodes.push_back(sn.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Otherwise, ...
|
||||
else
|
||||
{
|
||||
// ... we're looking for osd nodes only
|
||||
for (auto & sn: node_tree)
|
||||
{
|
||||
if (stoull_full(sn.first))
|
||||
{
|
||||
nodes.push_back(sn.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process gathered nodes
|
||||
for (auto & node: nodes)
|
||||
{
|
||||
// Check for osd node, return constant max size
|
||||
if (stoull_full(node))
|
||||
{
|
||||
max_pg_sz += 1;
|
||||
}
|
||||
// Otherwise, ...
|
||||
else
|
||||
{
|
||||
// ... exclude parent node from tree, and ...
|
||||
node_tree.erase(parent_node);
|
||||
|
||||
// ... descend onto the resulting tree
|
||||
max_pg_sz += get_max_pg_size(node_tree, level, node);
|
||||
}
|
||||
}
|
||||
|
||||
return max_pg_sz;
|
||||
return level_seen.size();
|
||||
}
|
||||
|
||||
json11::Json create_pool(const etcd_kv_t & kv)
|
||||
|
|
|
@ -226,6 +226,7 @@ class osd_t
|
|||
void parse_config(bool init);
|
||||
void init_cluster();
|
||||
void on_change_osd_state_hook(osd_num_t peer_osd);
|
||||
void on_change_backfillfull_hook(pool_id_t pool_id);
|
||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||
void on_load_config_hook(json11::Json::object & changes);
|
||||
|
|
|
@ -65,6 +65,7 @@ void osd_t::init_cluster()
|
|||
st_cli.tfd = tfd;
|
||||
st_cli.log_level = log_level;
|
||||
st_cli.on_change_osd_state_hook = [this](osd_num_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
||||
st_cli.on_change_backfillfull_hook = [this](pool_id_t pool_id) { on_change_backfillfull_hook(pool_id); };
|
||||
st_cli.on_change_pg_history_hook = [this](pool_id_t pool_id, pg_num_t pg_num) { on_change_pg_history_hook(pool_id, pg_num); };
|
||||
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_etcd_state_hook(changes); };
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
|
@ -414,6 +415,14 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
|||
}
|
||||
}
|
||||
|
||||
// Callback for a change of a pool's backfillfull flag; init_cluster() installs it
// as st_cli.on_change_backfillfull_hook.
// pool_id identifies the affected pool; the body does not use it.
// Effect: sets OSD_RECOVERING in peering_state unless OSD_RECOVERING or
// OSD_FLUSHING_PGS is already set.
// NOTE(review): presumably this makes the recovery loop re-scan PGs once the set of
// backfillfull pools changes -- confirm against the code that consumes peering_state.
void osd_t::on_change_backfillfull_hook(pool_id_t pool_id)
|
||||
{
|
||||
// Only act when neither the recovering nor the flushing flag is set
if (!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||
{
|
||||
// Raise the recovering flag, leaving every other bit untouched
peering_state = peering_state | OSD_RECOVERING;
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
||||
{
|
||||
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
|
||||
|
|
|
@ -252,10 +252,18 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
|||
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
|
||||
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
|
||||
// Restart scanning from the same PG as the last time
|
||||
restart:
|
||||
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
if ((pg_it->second.state & mask) == check)
|
||||
{
|
||||
auto pool_it = st_cli.pool_config.find(pg_it->first.pool_id);
|
||||
if (pool_it != st_cli.pool_config.end() && pool_it->second.backfillfull)
|
||||
{
|
||||
// Skip the pool
|
||||
recovery_last_pg.pool_id++;
|
||||
goto restart;
|
||||
}
|
||||
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||
assert(src.size() > 0);
|
||||
// Restart scanning from the next object
|
||||
|
|
Loading…
Reference in New Issue