/* ========================================================================= Copyright (c) 2023 MIND Software LLC. All Rights Reserved. This file is part of the Software-Defined Storage MIND UStor Project. For more information about this product, please visit https://mindsw.io or contact us directly at info@mindsw.io ========================================================================= */ #include #include "cli.h" #include "cli_pool_cfg.h" #include "cluster_client.h" #include "epoll_manager.h" #include "pg_states.h" #include "str_util.h" struct pool_creator_t { cli_tool_t *parent; bool force; pool_configurator_t *cfg; int state = 0; cli_result_t result; struct { uint32_t retries = 5; uint32_t interval = 0; bool passed = false; } create_check; uint64_t new_id = 1; uint64_t new_pools_mod_rev; json11::Json state_node_tree; json11::Json new_pools; json11::Json osd_tags_json; json11::Json primary_affinity_tags_json; bool is_done() { return state == 100; } void loop() { if (state == 1) goto resume_1; else if (state == 2) goto resume_2; else if (state == 3) goto resume_3; else if (state == 4) goto resume_4; else if (state == 5) goto resume_5; else if (state == 6) goto resume_6; else if (state == 7) goto resume_7; else if (state == 8) goto resume_8; // Validate pool parameters if (!cfg->validate(parent->cli->st_cli, NULL, !force)) { result = (cli_result_t){ .err = EINVAL, .text = cfg->get_error_string() + "\n" }; state = 100; return; } // OSD tags if (cfg->osd_tags != "") { osd_tags_json = parent->parse_tags(cfg->osd_tags); } // Primary affinity tags if (cfg->primary_affinity_tags != "") { primary_affinity_tags_json = parent->parse_tags(cfg->primary_affinity_tags); } state = 1; resume_1: // If not forced, check that we have enough osds for pg_size if (!force) { // Get node_placement configuration from etcd parent->etcd_txn(json11::Json::object { { "success", json11::Json::array { json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/node_placement") }, } } }, } }, }); state = 2; resume_2: if (parent->waiting > 0) return; if (parent->etcd_err.err) { result = parent->etcd_err; state = 100; return; } // Get state_node_tree based on node_placement and osd peer states { auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]); state_node_tree = get_state_node_tree(kv.value.object_items()); } // Skip tag checks, if pool has none if (!osd_tags_json.is_null()) { // Get osd configs (for tags) of osds in state_node_tree { json11::Json::array osd_configs; for (auto osd_num: state_node_tree["osds"].array_items()) { osd_configs.push_back(json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+osd_num.as_string()) }, } } }); } parent->etcd_txn(json11::Json::object { { "success", osd_configs, }, }); } state = 3; resume_3: if (parent->waiting > 0) return; if (parent->etcd_err.err) { result = parent->etcd_err; state = 100; return; } // Filter out osds from state_node_tree based on pool/osd tags { std::vector osd_configs; for (auto & ocr: parent->etcd_result["responses"].array_items()) { auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]); osd_configs.push_back(kv.value); } state_node_tree = filter_state_node_tree_by_tags(state_node_tree, osd_configs); } } // Get stats (for block_size, bitmap_granularity, ...) of osds in state_node_tree { json11::Json::array osd_stats; for (auto osd_num: state_node_tree["osds"].array_items()) { osd_stats.push_back(json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+osd_num.as_string()) }, } } }); } parent->etcd_txn(json11::Json::object { { "success", osd_stats, }, }); } state = 4; resume_4: if (parent->waiting > 0) return; if (parent->etcd_err.err) { result = parent->etcd_err; state = 100; return; } // Filter osds from state_node_tree based on pool parameters and osd stats { std::vector osd_stats; for (auto & ocr: parent->etcd_result["responses"].array_items()) { auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]); osd_stats.push_back(kv.value); } state_node_tree = filter_state_node_tree_by_stats(state_node_tree, osd_stats); } // Check that pg_size <= max_pg_size { uint64_t max_pg_size = get_max_pg_size(state_node_tree["nodes"].object_items(), cfg->failure_domain, cfg->root_node); if (cfg->pg_size > max_pg_size) { result = (cli_result_t){ .err = EINVAL, .text = "There are "+std::to_string(max_pg_size)+" failure domains with OSDs matching tags and " "block_size/bitmap_granularity/immediate_commit parameters, but you want to create a " "pool with "+std::to_string(cfg->pg_size)+" OSDs from different failure domains in a PG. " "Change parameters or add --force if you want to create a degraded pool and add OSDs later." }; state = 100; return; } } } // Create pool state = 5; resume_5: // Get pools from etcd parent->etcd_txn(json11::Json::object { { "success", json11::Json::array { json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") }, } } }, } }, }); state = 6; resume_6: if (parent->waiting > 0) return; if (parent->etcd_err.err) { result = parent->etcd_err; state = 100; return; } { // Add new pool auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]); new_pools = create_pool(kv); if (new_pools.is_string()) { result = (cli_result_t){ .err = EEXIST, .text = new_pools.string_value() }; state = 100; return; } new_pools_mod_rev = kv.mod_revision; } // Update pools in etcd parent->etcd_txn(json11::Json::object { { "compare", json11::Json::array { json11::Json::object { { "target", "MOD" }, { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") }, { "result", "LESS" }, { "mod_revision", new_pools_mod_rev+1 }, } } }, { "success", json11::Json::array { json11::Json::object { { "request_put", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") }, { "value", base64_encode(new_pools.dump()) }, } }, }, } }, }); state = 7; resume_7: if (parent->waiting > 0) return; if (parent->etcd_err.err) { result = parent->etcd_err; state = 100; return; } // Perform final create-check create_check.interval = parent->cli->config["mon_change_timeout"].uint64_value(); if (!create_check.interval) create_check.interval = 1000; state = 8; resume_8: if (parent->waiting > 0) return; // Unless forced, check that pool was created and is active if (force) create_check.passed = true; else if (create_check.retries) { create_check.retries--; parent->waiting++; parent->epmgr->tfd->set_timer(create_check.interval, false, [this](int timer_id) { if (parent->cli->st_cli.pool_config.find(new_id) != parent->cli->st_cli.pool_config.end()) { auto & pool_cfg = parent->cli->st_cli.pool_config[new_id]; create_check.passed = pool_cfg.real_pg_count > 0; for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++) { if (!(pg_it->second.cur_state & PG_ACTIVE)) { create_check.passed = false; break; } } if (create_check.passed) create_check.retries = 0; } parent->waiting--; parent->ringloop->wakeup(); }); return; } if (!create_check.passed) { result = (cli_result_t) { .err = EAGAIN, .text = "Pool "+cfg->name+" was created, but failed to become active. This may indicate that cluster " "state has changed while the pool was being created. Please check the current state and " "correct the pool's configuration if necessary.\n" }; } else { result = (cli_result_t){ .err = 0, .text = "Pool "+cfg->name+" created", .data = new_pools[std::to_string(new_id)] }; } state = 100; } // Returns a JSON object of form {"nodes": {...}, "osds": [...]} that // contains: all nodes (osds, hosts, ...) based on node_placement config // and current peer state, and a list of active peer osds. json11::Json get_state_node_tree(json11::Json::object node_placement) { // Erase non-peer osd nodes from node_placement for (auto np_it = node_placement.begin(); np_it != node_placement.end();) { // Numeric nodes are osds osd_num_t osd_num = stoull_full(np_it->first); // If node is osd and it is not in peer states, erase it if (osd_num > 0 && parent->cli->st_cli.peer_states.find(osd_num) == parent->cli->st_cli.peer_states.end()) { node_placement.erase(np_it++); } else np_it++; } // List of peer osds std::vector peer_osds; // Record peer osds and add missing osds/hosts to np for (auto & ps: parent->cli->st_cli.peer_states) { std::string osd_num = std::to_string(ps.first); // Record peer osd peer_osds.push_back(osd_num); // Add osd, if necessary if (node_placement.find(osd_num) == node_placement.end()) { std::string osd_host = ps.second["host"].as_string(); // Add host, if necessary if (node_placement.find(osd_host) == node_placement.end()) { node_placement[osd_host] = json11::Json::object { { "level", "host" } }; } node_placement[osd_num] = json11::Json::object { { "parent", osd_host } }; } } return json11::Json::object { { "osds", peer_osds }, { "nodes", node_placement } }; } // Returns new state_node_tree based on given state_node_tree with osds // filtered out by tags in given osd_configs and current pool config. // Requires: state_node_tree["osds"] must match osd_configs 1-1 json11::Json filter_state_node_tree_by_tags(const json11::Json & state_node_tree, std::vector & osd_configs) { auto & osds = state_node_tree["osds"].array_items(); // Accepted state_node_tree nodes auto accepted_nodes = state_node_tree["nodes"].object_items(); // List of accepted osds std::vector accepted_osds; for (size_t i = 0; i < osd_configs.size(); i++) { auto & oc = osd_configs[i].object_items(); // Get osd number auto osd_num = osds[i].as_string(); // We need tags in config to check against pool tags if (oc.find("tags") == oc.end()) { // Exclude osd from state_node_tree nodes accepted_nodes.erase(osd_num); continue; } else { // If all pool tags are in osd tags, accept osd if (all_in_tags(osd_configs[i]["tags"], osd_tags_json)) { accepted_osds.push_back(osd_num); } // Otherwise, exclude osd else { // Exclude osd from state_node_tree nodes accepted_nodes.erase(osd_num); } } } return json11::Json::object { { "osds", accepted_osds }, { "nodes", accepted_nodes } }; } // Returns new state_node_tree based on given state_node_tree with osds // filtered out by stats parameters (block_size, bitmap_granularity) in // given osd_stats and current pool config. // Requires: state_node_tree["osds"] must match osd_stats 1-1 json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::vector & osd_stats) { auto & osds = state_node_tree["osds"].array_items(); // Accepted state_node_tree nodes auto accepted_nodes = state_node_tree["nodes"].object_items(); // List of accepted osds std::vector accepted_osds; for (size_t i = 0; i < osd_stats.size(); i++) { auto & os = osd_stats[i].object_items(); // Get osd number auto osd_num = osds[i].as_string(); // Check data_block_size if (os.find("data_block_size") != os.end()) { uint64_t p_block_size = cfg->block_size ? cfg->block_size : parent->cli->st_cli.global_block_size; uint64_t o_block_size = osd_stats[i]["data_block_size"].int64_value(); if (p_block_size != o_block_size) { accepted_nodes.erase(osd_num); continue; } } // Check bitmap_granularity if (os.find("bitmap_granularity") != os.end()) { uint64_t p_bitmap_granularity = cfg->bitmap_granularity ? cfg->bitmap_granularity : parent->cli->st_cli.global_bitmap_granularity; uint64_t o_bitmap_granularity = osd_stats[i]["bitmap_granularity"].int64_value(); if (p_bitmap_granularity != o_bitmap_granularity) { accepted_nodes.erase(osd_num); continue; } } // Check immediate_commit if (os.find("immediate_commit") != os.end()) { uint32_t p_immediate_commit = (cfg->immediate_commit != "") ? parent->cli->st_cli.parse_immediate_commit_string(cfg->immediate_commit) : parent->cli->st_cli.global_immediate_commit; uint32_t o_immediate_commit = parent->cli->st_cli.parse_immediate_commit_string(osd_stats[i]["immediate_commit"].string_value()); if (o_immediate_commit < p_immediate_commit) { accepted_nodes.erase(osd_num); continue; } } // Accept osd if all checks passed accepted_osds.push_back(osd_num); } return json11::Json::object { { "osds", accepted_osds }, { "nodes", accepted_nodes } }; } // Returns maximum pg_size possible for given node_tree and failure_domain, starting at parent_node uint64_t get_max_pg_size(json11::Json::object node_tree, const std::string & failure_domain = "", const std::string & parent_node = "") { uint64_t max_pg_sz = 0; std::vector nodes; const std::string level = (failure_domain != "") ? failure_domain : "osd"; // Check if parnet node is an osd (numeric) if (parent_node != "" && stoull_full(parent_node)) { // Add it to node list if osd is in node tree if (node_tree.find(parent_node) != node_tree.end()) nodes.push_back(parent_node); } // If parent node given, ... else if (parent_node != "") { // ... look for children nodes of this parent for (auto & sn: node_tree) { auto & props = sn.second.object_items(); auto parent_prop = props.find("parent"); if (parent_prop != props.end() && (parent_prop->second.as_string() == parent_node)) { nodes.push_back(sn.first); // If we're not looking for all osds, we only need a single // child osd node if (level != "osd" && stoull_full(sn.first)) break; } } } // No parent node given, and we're not looking for all osds else if (level != "osd") { // ... look for all level nodes for (auto & sn: node_tree) { auto & props = sn.second.object_items(); auto level_prop = props.find("level"); if (level_prop != props.end() && (level_prop->second.as_string() == level)) { nodes.push_back(sn.first); } } } // Otherwise, ... else { // ... we're looking for osd nodes only for (auto & sn: node_tree) { if (stoull_full(sn.first)) { nodes.push_back(sn.first); } } } // Process gathered nodes for (auto & node: nodes) { // Check for osd node, return constant max size if (stoull_full(node)) { max_pg_sz += 1; } // Otherwise, ... else { // ... exclude parent node from tree, and ... node_tree.erase(parent_node); // ... descend onto the resulting tree max_pg_sz += get_max_pg_size(node_tree, level, node); } } return max_pg_sz; } json11::Json create_pool(const etcd_kv_t & kv) { for (auto & p: kv.value.object_items()) { // ID uint64_t pool_id; char null_byte = 0; sscanf(p.first.c_str(), "%lu%c", &pool_id, &null_byte); new_id = std::max(pool_id+1, new_id); // Name if (p.second["name"].string_value() == cfg->name) { return json11::Json("Pool "+std::to_string(pool_id)+" has the same name\n"); } } json11::Json::object new_pool = json11::Json::object { { "name", cfg->name }, { "scheme", cfg->scheme }, { "pg_size", cfg->pg_size }, { "pg_minsize", cfg->pg_minsize }, { "pg_count", cfg->pg_count }, { "parity_chunks", cfg->parity_chunks }, }; if (cfg->failure_domain != "") new_pool["failure_domain"] = cfg->failure_domain; if (cfg->max_osd_combinations) new_pool["max_osd_combinations"] = cfg->max_osd_combinations; if (cfg->block_size) new_pool["block_size"] = cfg->block_size; if (cfg->bitmap_granularity) new_pool["bitmap_granularity"] = cfg->bitmap_granularity; if (cfg->immediate_commit != "") new_pool["immediate_commit"] = cfg->immediate_commit; if (cfg->pg_stripe_size) new_pool["pg_stripe_size"] = cfg->pg_stripe_size; if (cfg->root_node != "") new_pool["root_node"] = cfg->root_node; if (cfg->scrub_interval != "") new_pool["scrub_interval"] = cfg->scrub_interval; if (cfg->osd_tags != "") new_pool["osd_tags"] = osd_tags_json; if (cfg->primary_affinity_tags != "") new_pool["primary_affinity_tags"] = primary_affinity_tags_json; auto res = kv.value.object_items(); res[std::to_string(new_id)] = new_pool; return res; } // Checks whether tags2 tags are all in tags1 tags bool all_in_tags(json11::Json tags1, json11::Json tags2) { if (!tags2.is_array()) { tags2 = json11::Json::array{ tags2.string_value() }; } if (!tags1.is_array()) { tags1 = json11::Json::array{ tags1.string_value() }; } for (auto & tag2: tags2.array_items()) { bool found = false; for (auto & tag1: tags1.array_items()) { if (tag1 == tag2) { found = true; break; } } if (!found) { return false; } } return true; } }; std::function cli_tool_t::start_pool_create(json11::Json cfg) { auto pool_creator = new pool_creator_t(); pool_creator->parent = this; pool_creator->cfg = new pool_configurator_t(); if (!pool_creator->cfg->parse(cfg, true)) { std::string err = pool_creator->cfg->get_error_string(); return [err](cli_result_t & result) { result = (cli_result_t){ .err = EINVAL, .text = err + "\n" }; return true; }; } pool_creator->force = !cfg["force"].is_null(); return [pool_creator](cli_result_t & result) { pool_creator->loop(); if (pool_creator->is_done()) { result = pool_creator->result; delete pool_creator; return true; } return false; }; }