Implement "inverse merge" optimisation
parent
6d307d5391
commit
4319091bd3
|
@ -6,10 +6,21 @@
|
|||
#include "base64.h"
|
||||
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
// If the requested snapshot chain has only 1 child and --writers-stopped is specified
|
||||
// then that child can be merged "down" into the snapshot chain.
|
||||
// Otherwise we iterate over all children of the chain, merge removed parents into them,
|
||||
// and delete children afterwards.
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating though them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
|
@ -18,19 +29,17 @@
|
|||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Merge <from>..<to> to <child 2>
|
||||
// 2) Set <child 2> parent to <parent>
|
||||
// 3) Variant #1, trickier, beneficial when <child 1> has less data than <to>
|
||||
// (not implemented yet):
|
||||
// - Merge <to>..<child 1> to <to>
|
||||
// - Rename <to> to <child 1>
|
||||
// It can be done without extra precautions if <child 1> is a read-only layer itself
|
||||
// Otherwise it should be either done offline or by pausing writers
|
||||
// - <to> is now deleted, repeat deletion with <from>..<layer 2>
|
||||
// 4) Variant #2, simple:
|
||||
// - Repeat 1-2 with <child 1>
|
||||
// - Delete <to>
|
||||
// 5) Process all other children
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
@ -44,13 +53,123 @@ struct snap_remover_t
|
|||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int current_child = 0;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 9;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 9)
|
||||
goto resume_9;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
resume_2:
|
||||
while (!cb())
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
resume_4:
|
||||
while (!cb())
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
resume_5:
|
||||
while (!cb())
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child metadata and rename parent over it
|
||||
rename_inverse_parent();
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb())
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 9;
|
||||
resume_9:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
|
@ -78,7 +197,6 @@ struct snap_remover_t
|
|||
}
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
std::map<inode_t,int> sources;
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
|
@ -94,67 +212,196 @@ struct snap_remover_t
|
|||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool is_done()
|
||||
void read_stats()
|
||||
{
|
||||
return state == 5;
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
for (auto cp: sources)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
exit(1);
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void loop()
|
||||
void choose_inverse_candidate()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Merge children one by one
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
start_merge_child();
|
||||
resume_1:
|
||||
while (!cb())
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
state = 1;
|
||||
return;
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 2;
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete sources
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
start_delete_source();
|
||||
resume_3:
|
||||
while (!cb())
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
state = 3;
|
||||
return;
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||
exit(1);
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 4;
|
||||
resume_4:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||
exit(1);
|
||||
}
|
||||
state = 5;
|
||||
resume_5:
|
||||
// Done
|
||||
return;
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
json11::Json::object new_cfg = json11::Json::object {
|
||||
{ "name", child_cfg->name },
|
||||
{ "size", child_cfg->size },
|
||||
};
|
||||
if (new_parent)
|
||||
{
|
||||
if (INODE_POOL(inverse_parent) != INODE_POOL(new_parent))
|
||||
new_cfg["parent_pool"] = (uint64_t)INODE_POOL(new_parent);
|
||||
new_cfg["parent_id"] = (uint64_t)INODE_NO_POOL(new_parent);
|
||||
}
|
||||
if (child_cfg->readonly)
|
||||
{
|
||||
new_cfg["readonly"] = true;
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(new_cfg).dump()) },
|
||||
} }
|
||||
},
|
||||
} },
|
||||
}, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(stderr, "Layer %s or %s configuration was modified during renaming\n", target_name.c_str(), child_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void delete_inode_config(inode_t cur)
|
||||
|
@ -184,11 +431,12 @@ resume_5:
|
|||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
} }
|
||||
} },
|
||||
},
|
||||
} },
|
||||
}, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
|
||||
|
@ -200,39 +448,44 @@ resume_5:
|
|||
exit(1);
|
||||
}
|
||||
printf("Layer %s deleted\n", cur_name.c_str());
|
||||
parent->waiting--;
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void start_merge_child()
|
||||
void start_merge_child(inode_t child_inode, inode_t target_inode)
|
||||
{
|
||||
auto target = parent->cli->st_cli.inode_config.find(merge_children[current_child]);
|
||||
if (target == parent->cli->st_cli.inode_config.end())
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", merge_children[current_child]);
|
||||
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_merge(json11::Json::object {
|
||||
{ "command", json11::Json::array{ "merge", from_name, target->second.name } },
|
||||
{ "target", target->second.name },
|
||||
{ "command", json11::Json::array{ "merge", from_name, child_it->second.name } },
|
||||
{ "target", target_it->second.name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
|
||||
void start_delete_source()
|
||||
void start_delete_source(inode_t inode)
|
||||
{
|
||||
auto source = parent->cli->st_cli.inode_config.find(chain_list[current_child]);
|
||||
auto source = parent->cli->st_cli.inode_config.find(inode);
|
||||
if (source == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", chain_list[current_child]);
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_rm(json11::Json::object {
|
||||
{ "inode", chain_list[current_child] },
|
||||
{ "pool", (uint64_t)INODE_POOL(chain_list[current_child]) },
|
||||
{ "inode", inode },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
|
@ -259,6 +512,8 @@ std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
|||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover]()
|
||||
{
|
||||
snap_remover->loop();
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
typedef uint32_t pool_id_t;
|
||||
|
|
Loading…
Reference in New Issue