Compare commits

...

2 Commits

Author SHA1 Message Date
Vitaliy Filippov 66c9271cbd Radically simplify create-pool pg_size check
Test / test_rebalance_verify_ec (push) Successful in 1m38s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m40s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_write (push) Successful in 32s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m20s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 15s Details
Test / test_osd_tags (push) Successful in 10s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 16s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-22 01:44:14 +03:00
Vitaliy Filippov 7b37ba921d Pause pool rebalance when monitor detects that it can lead to any OSD becoming full
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m39s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 37s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_ec (push) Successful in 2m18s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m27s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Failing after 11s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m19s Details
2024-11-22 01:01:07 +03:00
9 changed files with 147 additions and 128 deletions

View File

@ -56,6 +56,7 @@ const etcd_tree = {
osd_out_time: 600, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
use_old_pg_combinator: false,
osd_backfillfull_ratio: 0.99,
// client and osd
tcp_header_buffer_size: 65536,
use_sync_send_recv: false,

View File

@ -74,6 +74,7 @@ class Mon
this.state = JSON.parse(JSON.stringify(etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.recheck_pgs_active = false;
this.updating_total_stats = false;
this.watcher_active = false;
this.old_pg_config = false;
this.old_pg_stats_seen = false;
@ -658,7 +659,13 @@ class Mon
this.etcd_watch_revision, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
}
new_pg_config.hash = tree_hash;
return await this.save_pg_config(new_pg_config, etcd_request);
const { backfillfull_pools } = sum_object_counts({ ...this.state, pg: { ...this.state.pg, config: new_pg_config } }, this.config);
new_pg_config.backfillfull_pools = backfillfull_pools.length ? backfillfull_pools : undefined;
if (!await this.save_pg_config(new_pg_config, etcd_request))
{
return false;
}
return true;
}
async save_pg_config(new_pg_config, etcd_request = { compare: [], success: [] })
@ -730,7 +737,7 @@ class Mon
async update_total_stats()
{
const txn = [];
const { object_counts, object_bytes } = sum_object_counts(this.state, this.config);
const { object_counts, object_bytes, backfillfull_pools } = sum_object_counts(this.state, this.config);
let stats = sum_op_stats(this.state.osd, this.prev_stats);
let { inode_stats, seen_pools } = sum_inode_stats(this.state, this.prev_stats);
stats.object_counts = object_counts;
@ -783,6 +790,16 @@ class Mon
{
await this.etcd.etcd_call('/kv/txn', { success: txn }, this.config.etcd_mon_timeout, 0);
}
if (!this.recheck_pgs_active &&
backfillfull_pools.join(',') != ((this.state.pg.config||{}).no_rebalance_pools||[]).join(','))
{
console.log(
(backfillfull_pools.length ? 'Pool(s) '+backfillfull_pools.join(', ') : 'No pools')+
' are backfillfull, applying rebalance configuration'
);
const new_pg_config = { ...this.state.pg.config, backfillfull_pools: backfillfull_pools.length ? backfillfull_pools : undefined };
await this.save_pg_config(new_pg_config);
}
}
schedule_update_stats()
@ -794,7 +811,21 @@ class Mon
this.stats_timer = setTimeout(() =>
{
this.stats_timer = null;
this.update_total_stats().catch(console.error);
if (this.updating_total_stats)
{
this.schedule_update_stats();
return;
}
this.updating_total_stats = true;
try
{
this.update_total_stats().catch(console.error);
}
catch (e)
{
console.error(e);
}
this.updating_total_stats = false;
}, this.config.mon_stats_timeout);
}

View File

@ -109,6 +109,8 @@ function sum_object_counts(state, global_config)
pgstats[pool_id] = { ...(state.pg.stats[pool_id] || {}), ...(pgstats[pool_id] || {}) };
}
}
const pool_per_osd = {};
const clean_per_osd = {};
for (const pool_id in pgstats)
{
let object_size = 0;
@ -143,10 +145,38 @@ function sum_object_counts(state, global_config)
object_bytes[k] += BigInt(st[k+'_count']) * object_size;
}
}
if (st.object_count)
{
for (const pg_osd in (((state.pg.config.items||{})[pool_id]||{})[pg_num]||{}).osd_set||[])
{
if (!(pg_osd in clean_per_osd))
{
clean_per_osd[pg_osd] = 0n;
}
clean_per_osd[pg_osd] += BigInt(st.object_count);
pool_per_osd[pg_osd] = pool_per_osd[pg_osd]||{};
pool_per_osd[pg_osd][pool_id] = true;
}
}
}
}
}
return { object_counts, object_bytes };
// If clean_per_osd[osd] is larger than osd capacity then it will fill up during rebalance
let backfillfull_pools = {};
for (const osd in clean_per_osd)
{
const st = state.osd.stats[osd];
if (st && st.size && st.data_block_size && (BigInt(st.size)/BigInt(st.data_block_size)*
BigInt((global_config.osd_backfillfull_ratio||0.99)*1000000)/1000000n) < clean_per_osd[osd])
{
for (const pool_id in pool_per_osd[osd])
{
backfillfull_pools[pool_id] = true;
}
}
}
backfillfull_pools = Object.keys(backfillfull_pools).sort();
return { object_counts, object_bytes, backfillfull_pools };
}
// sum_inode_stats(this.state, this.prev_stats)

View File

@ -785,7 +785,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
for (auto & pool_item: value.object_items())
{
pool_config_t pc;
pool_config_t pc = {};
// ID
pool_id_t pool_id;
char null_byte = 0;
@ -931,12 +931,28 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
// Ignore old key if the new one is present
return;
}
for (auto & pool_id_json: value["backfillfull_pools"].array_items())
{
auto pool_id = pool_id_json.uint64_value();
auto pool_it = this->pool_config.find(pool_id);
if (pool_it != this->pool_config.end())
{
pool_it->second.backfillfull |= 2;
}
}
for (auto & pool_item: this->pool_config)
{
for (auto & pg_item: pool_item.second.pg_config)
{
pg_item.second.config_exists = false;
}
// 3 = was 1 and became 1, 0 = was 0 and became 0
if (pool_item.second.backfillfull == 2 || pool_item.second.backfillfull == 1)
{
if (on_change_backfillfull_hook)
on_change_backfillfull_hook(pool_item.first);
}
pool_item.second.backfillfull = pool_item.second.backfillfull >> 1;
}
for (auto & pool_item: value["items"].object_items())
{

View File

@ -62,6 +62,7 @@ struct pool_config_t
std::map<pg_num_t, pg_config_t> pg_config;
uint64_t scrub_interval;
std::string used_for_fs;
int backfillfull;
};
struct inode_config_t
@ -131,6 +132,7 @@ public:
std::function<json11::Json()> load_pgs_checks_hook;
std::function<void(bool)> on_load_pgs_hook;
std::function<void()> on_change_pool_config_hook;
std::function<void(pool_id_t)> on_change_backfillfull_hook;
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook;

View File

@ -35,6 +35,7 @@ struct pool_creator_t
uint64_t new_pools_mod_rev;
json11::Json state_node_tree;
json11::Json new_pools;
std::map<osd_num_t, json11::Json> osd_stats;
bool is_done() { return state == 100; }
@ -46,8 +47,6 @@ struct pool_creator_t
goto resume_2;
else if (state == 3)
goto resume_3;
else if (state == 4)
goto resume_4;
else if (state == 5)
goto resume_5;
else if (state == 6)
@ -121,15 +120,15 @@ resume_2:
// Get state_node_tree based on node_placement and osd stats
{
auto node_placement_kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
std::map<osd_num_t, json11::Json> osd_stats;
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
uint64_t osd_out_time = parent->cli->config["osd_out_time"].uint64_value();
if (!osd_out_time)
osd_out_time = 600;
osd_stats.clear();
parent->iterate_kvs_1(parent->etcd_result["responses"][1]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
{
if (value["time"].uint64_value()+osd_out_time >= tv_now.tv_sec)
if ((uint64_t)value["time"].number_value()+osd_out_time >= tv_now.tv_sec)
osd_stats[cur_osd] = value;
});
state_node_tree = get_state_node_tree(node_placement_kv.value.object_items(), osd_stats);
@ -175,42 +174,18 @@ resume_3:
}
}
// Get stats (for block_size, bitmap_granularity, ...) of osds in state_node_tree
{
json11::Json::array osd_stats;
for (auto osd_num: state_node_tree["osds"].array_items())
{
osd_stats.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+osd_num.as_string()) },
} }
});
}
parent->etcd_txn(json11::Json::object{ { "success", osd_stats } });
}
state = 4;
resume_4:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Filter osds from state_node_tree based on pool parameters and osd stats
{
std::vector<json11::Json> osd_stats;
for (auto & ocr: parent->etcd_result["responses"].array_items())
std::vector<json11::Json> filtered_osd_stats;
for (auto & osd_num: state_node_tree["osds"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]);
osd_stats.push_back(kv.value);
auto st_it = osd_stats.find(osd_num.uint64_value());
if (st_it != osd_stats.end())
{
filtered_osd_stats.push_back(st_it->second);
}
}
guess_block_size(osd_stats);
guess_block_size(filtered_osd_stats);
state_node_tree = filter_state_node_tree_by_stats(state_node_tree, osd_stats);
}
@ -218,8 +193,7 @@ resume_4:
{
auto failure_domain = cfg["failure_domain"].string_value() == ""
? "host" : cfg["failure_domain"].string_value();
uint64_t max_pg_size = get_max_pg_size(state_node_tree["nodes"].object_items(),
failure_domain, cfg["root_node"].string_value());
uint64_t max_pg_size = get_max_pg_size(state_node_tree, failure_domain, cfg["root_node"].string_value());
if (cfg["pg_size"].uint64_value() > max_pg_size)
{
@ -411,13 +385,11 @@ resume_8:
};
}
// Add osd if necessary
if (node_placement.find(osd_num) == node_placement.end())
{
node_placement[osd_num] = json11::Json::object {
{ "parent", osd_host }
};
}
// Add osd
node_placement[osd_num] = json11::Json::object {
{ "parent", node_placement[osd_num]["parent"].is_null() ? osd_host : node_placement[osd_num]["parent"] },
{ "level", "osd" },
};
}
return json11::Json::object { { "osds", existing_osds }, { "nodes", node_placement } };
@ -547,15 +519,13 @@ resume_8:
// filtered out by stats parameters (block_size, bitmap_granularity) in
// given osd_stats and current pool config.
// Requires: state_node_tree["osds"] must match osd_stats 1-1
json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::vector<json11::Json> & osd_stats)
json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::map<osd_num_t, json11::Json> & osd_stats)
{
auto & osds = state_node_tree["osds"].array_items();
// Accepted state_node_tree nodes
auto accepted_nodes = state_node_tree["nodes"].object_items();
// List of accepted osds
std::vector<std::string> accepted_osds;
json11::Json::array accepted_osds;
block_size = cfg["block_size"].uint64_value()
? cfg["block_size"].uint64_value()
@ -567,21 +537,25 @@ resume_8:
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value(), IMMEDIATE_ALL)
: parent->cli->st_cli.global_immediate_commit;
for (size_t i = 0; i < osd_stats.size(); i++)
for (auto osd_num_json: state_node_tree["osds"].array_items())
{
auto & os = osd_stats[i];
// Get osd number
auto osd_num = osds[i].as_string();
auto osd_num = osd_num_json.uint64_value();
auto os_it = osd_stats.find(osd_num);
if (os_it == osd_stats.end())
{
continue;
}
auto & os = os_it->second;
if (!os["data_block_size"].is_null() && os["data_block_size"] != block_size ||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != bitmap_granularity ||
!os["immediate_commit"].is_null() &&
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value(), IMMEDIATE_NONE) < immediate_commit)
{
accepted_nodes.erase(osd_num);
accepted_nodes.erase(osd_num_json.as_string());
}
else
{
accepted_osds.push_back(osd_num);
accepted_osds.push_back(osd_num_json);
}
}
@ -589,81 +563,28 @@ resume_8:
}
// Returns maximum pg_size possible for given node_tree and failure_domain, starting at parent_node
uint64_t get_max_pg_size(json11::Json::object node_tree, const std::string & level, const std::string & parent_node)
uint64_t get_max_pg_size(json11::Json state_node_tree, const std::string & level, const std::string & root_node)
{
uint64_t max_pg_sz = 0;
std::vector<std::string> nodes;
// Check if parent node is an osd (numeric)
if (parent_node != "" && stoull_full(parent_node))
std::set<std::string> level_seen;
for (auto & osd: state_node_tree["osds"].array_items())
{
// Add it to node list if osd is in node tree
if (node_tree.find(parent_node) != node_tree.end())
nodes.push_back(parent_node);
}
// If parent node given, ...
else if (parent_node != "")
{
// ... look for child nodes of this parent
for (auto & sn: node_tree)
// find OSD parent at <level>, but stop at <root_node>
auto cur_id = osd.string_value();
auto cur = state_node_tree["nodes"][cur_id];
while (!cur.is_null())
{
if (sn.second["parent"] == parent_node)
if (cur["level"] == level)
{
nodes.push_back(sn.first);
// If we're not looking for all osds, we only need a single
// child osd node
if (level != "osd" && stoull_full(sn.first))
break;
level_seen.insert(cur_id);
break;
}
if (cur_id == root_node)
break;
cur_id = cur["parent"].string_value();
cur = state_node_tree["nodes"][cur_id];
}
}
// No parent node given, and we're not looking for all osds
else if (level != "osd")
{
// ... look for all level nodes
for (auto & sn: node_tree)
{
if (sn.second["level"] == level)
{
nodes.push_back(sn.first);
}
}
}
// Otherwise, ...
else
{
// ... we're looking for osd nodes only
for (auto & sn: node_tree)
{
if (stoull_full(sn.first))
{
nodes.push_back(sn.first);
}
}
}
// Process gathered nodes
for (auto & node: nodes)
{
// Check for osd node, return constant max size
if (stoull_full(node))
{
max_pg_sz += 1;
}
// Otherwise, ...
else
{
// ... exclude parent node from tree, and ...
node_tree.erase(parent_node);
// ... descend onto the resulting tree
max_pg_sz += get_max_pg_size(node_tree, level, node);
}
}
return max_pg_sz;
return level_seen.size();
}
json11::Json create_pool(const etcd_kv_t & kv)

View File

@ -226,6 +226,7 @@ class osd_t
void parse_config(bool init);
void init_cluster();
void on_change_osd_state_hook(osd_num_t peer_osd);
void on_change_backfillfull_hook(pool_id_t pool_id);
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
void on_load_config_hook(json11::Json::object & changes);

View File

@ -65,6 +65,7 @@ void osd_t::init_cluster()
st_cli.tfd = tfd;
st_cli.log_level = log_level;
st_cli.on_change_osd_state_hook = [this](osd_num_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_backfillfull_hook = [this](pool_id_t pool_id) { on_change_backfillfull_hook(pool_id); };
st_cli.on_change_pg_history_hook = [this](pool_id_t pool_id, pg_num_t pg_num) { on_change_pg_history_hook(pool_id, pg_num); };
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_etcd_state_hook(changes); };
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@ -414,6 +415,14 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
}
}
void osd_t::on_change_backfillfull_hook(pool_id_t pool_id)
{
if (!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
{
peering_state = peering_state | OSD_RECOVERING;
}
}
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
{
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())

View File

@ -252,10 +252,18 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
// Restart scanning from the same PG as the last time
restart:
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
{
if ((pg_it->second.state & mask) == check)
{
auto pool_it = st_cli.pool_config.find(pg_it->first.pool_id);
if (pool_it != st_cli.pool_config.end() && pool_it->second.backfillfull)
{
// Skip the pool
recovery_last_pg.pool_id++;
goto restart;
}
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
assert(src.size() > 0);
// Restart scanning from the next object