1
0
Fork 0

Compare commits

...

4 Commits

5 changed files with 270 additions and 65 deletions

View File

@ -50,7 +50,8 @@ async function lp_solve(text)
return { score, vars };
}
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, hier_sizes = null,
max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
{
if (!pg_count || !osd_tree)
{
@ -58,7 +59,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
}
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
const all_pgs = Object.values(random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout));
const pg_per_osd = {};
for (const pg of all_pgs)
{
@ -216,39 +217,45 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
return move_weights;
}
function add_valid_previous(osd_tree, prev_weights, all_pgs)
// Map every leaf of a nested plain-object tree to the list of its ancestor keys.
//
// osd_tree = { group_id: { subgroup_id: { leaf_id: scalar } } } with any nesting depth
// res      = accumulator object, filled in place and returned
// parents  = ancestor keys of <osd_tree> itself, outermost first
//
// Returns { leaf_id: [ outermost_parent, ..., direct_parent ] }.
// Leaves sharing the same direct parent share the same array instance.
function build_parent_per_leaf(osd_tree, res = {}, parents = [])
{
    for (const name in osd_tree)
    {
        const child = osd_tree[name];
        if (!(child instanceof Object))
        {
            // Scalar value => <name> is a leaf
            res[name] = parents;
            continue;
        }
        // Nested object => descend with <name> appended to the ancestor path
        build_parent_per_leaf(child, res, parents.concat([ name ]));
    }
    return res;
}
// Add previous PG combinations that are still valid for the current tree to <all_pgs> (in place).
//
// osd_tree     = nested plain-object tree { group: { ...: { osd_id: weight } } }
// prev_weights = object keyed by 'pg_<osd>_<osd>_...'; only the keys are used here
// all_pgs      = { pg_name: [ osd_id, ... ] }; existing entries are never overwritten
// hier_sizes   = number of items selected on each tree level, e.g. [ 3, 2, 1 ].
//                When omitted, every group may hold at most one OSD of a PG, which is
//                the old "one OSD per failure domain" rule.
//
// A previous PG is kept only if every OSD still exists in <osd_tree> and no group contains
// more OSDs of that PG than the layout allows. For a group at depth i the allowance is
// hier_sizes[i+1] * hier_sizes[i+2] * ... (the number of leaves selected below one such group).
// NOTE(review): the number of distinct groups used per level is not checked here.
function add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes = [])
{
    const sizes = hier_sizes || [];
    const parent_per_osd = build_parent_per_leaf(osd_tree);
    // Max OSDs of one PG allowed under a single group at the given depth
    const leaf_limit = (depth) =>
    {
        let limit = 1;
        for (let j = depth+1; j < sizes.length; j++)
            limit *= Number(sizes[j]) || 1;
        return limit;
    };
    skip_pg: for (const pg_name in prev_weights)
    {
        // Counters are keyed by depth + name so equal names on different levels don't collide
        const seen = new Map();
        const pg = pg_name.slice(3).split(/_/);
        for (const osd of pg)
        {
            const parents = parent_per_osd[osd];
            if (!parents)
                continue skip_pg;
            for (let i = 0; i < parents.length; i++)
            {
                const key = i+':'+parents[i];
                // Start from 0: incrementing a missing counter would yield NaN and disable the check
                const used = (seen.get(key) || 0) + 1;
                if (used > leaf_limit(i))
                    continue skip_pg;
                seen.set(key, used);
            }
        }
        if (!all_pgs[pg_name])
        {
            all_pgs[pg_name] = pg;
        }
    }
}
// Try to minimize data movement
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2,
hier_sizes = null, max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
{
if (!osd_tree)
{
@ -273,10 +280,10 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
}
const old_pg_size = prev_int_pgs[0].length;
// Get all combinations
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
let all_pgs = random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout);
if (old_pg_size == pg_size)
{
add_valid_previous(osd_tree, prev_weights, all_pgs);
add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes || [ pg_size, 1 ]);
}
all_pgs = Object.values(all_pgs);
const pg_per_osd = {};
@ -502,41 +509,147 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
}
}
// Convert multi-level tree_node = { level: number|string, id?: string, size?: number, children?: tree_node[] }
// to a nested plain-object tree suitable for random_hier_combinations()
// (or in case of just 2 levels - for all_combinations() / random_combinations())
//
// tree_node      = root of the source tree
// extract_levels = levels to keep, outermost first; the last one is the leaf level.
//                  Each entry is either a name from <level_defs> or a plain number
// level_defs     = { level_name: number }, larger numbers are deeper levels
// new_tree       = internal accumulator { idx: next auto-number, items: result }, used by recursion
//
// A node whose level is at or below the wanted level closes the current group: for a
// non-leaf level a group named '<level><n>' ('l<number>_<n>' for numeric levels) is created
// even when the real tree skips that level, so every branch has the same depth.
// Leaves are keyed by tree_node.id (or an auto-generated name) and hold tree_node.size.
//
// Example:
// tree_node = { level: 'dc', children: [ { level: 'rack', children: [ { level: 'host', children: [ { level: 'osd', id: 'osd1', size: 10 } ] } ] } ] }
// extract_levels = [ 'rack', 'osd' ]
// level_defs = { dc: 1, rack: 2, host: 3, osd: 4 }
//
// Result:
// { rack1: { osd1: 10 } }
function extract_tree_levels(tree_node, extract_levels, level_defs, new_tree = { idx: 1, items: {} })
{
    const wanted = extract_levels[0];
    const next_level = Number(level_defs[wanted] || wanted) || 0;
    const level_name = level_defs[wanted] ? wanted : 'l'+wanted+'_';
    const is_leaf = extract_levels.length === 1;
    if ((level_defs[tree_node.level] || tree_node.level) >= next_level)
    {
        if (!is_leaf)
        {
            // Insert a (possibly fake) level and continue with the same node for the next wanted level
            const nt = { idx: 1, items: {} };
            new_tree.items[level_name+(new_tree.idx++)] = nt.items;
            extract_tree_levels(tree_node, extract_levels.slice(1), level_defs, nt);
        }
        else
        {
            // Insert a leaf node
            const leaf_id = tree_node.id || (level_name+(new_tree.idx++));
            new_tree.items[leaf_id] = tree_node.size;
        }
    }
    else
    {
        // Node is above the wanted level - look for matching nodes among its children
        for (const child_node of tree_node.children||[])
        {
            extract_tree_levels(child_node, extract_levels, level_defs, new_tree);
        }
    }
    return new_tree.items;
}
// Generate random PGs with hierarchical failure domains, i.e. for example 3 DC each with 2 HOSTS
//
// osd_tree = { level3_id: { level2_id: { level1_id: scalar_value } }, ... }
//   osd_tree may contain arbitrary number of levels, but level count must be the same across the whole tree
// size_per_level = number of items to select on each level, for example [3, 2, 1].
//   must have the same number of items as the osd_tree level count.
// count = number of purely random PGs to generate in addition to the enumerated ones
// ordered = don't treat (x,y) and (y,x) as equal
// seq_layout = true for the [DC1,DC1,DC2,DC2,DC3,DC3] layout, false for [DC1,DC2,DC3,DC1,DC2,DC3] layout
//
// Returns { 'pg_<id>_<id>_...': [ id, ... ] }. Positions that can't be filled because a
// group has too few children are set to NO_OSD.
function random_hier_combinations(osd_tree, size_per_level, count, ordered, seq_layout)
{
    // xorshift-like generator with a fixed seed, so the same input always gives the same PGs
    let seed = 0x5f020e43;
    const rng = () =>
    {
        seed ^= seed << 13;
        seed ^= seed >> 17;
        seed ^= seed << 5;
        // seed is a signed 32-bit value, shift it into the non-negative range
        return seed + 2147483648;
    };
    // Tree depth, measured along the first key of every level
    const get_max_level = (o) =>
    {
        let lvl = 0;
        while (o instanceof Object)
        {
            let descended = false;
            for (const k in o)
            {
                lvl++;
                o = o[k];
                descended = true;
                break;
            }
            if (!descended)
            {
                // Empty group: depth can't be probed (and the loop would never end),
                // so rely on the documented contract that size_per_level has one entry per level
                return size_per_level.length;
            }
        }
        return lvl;
    };
    const max_level = get_max_level(osd_tree);
    // Build one PG; select(level, i, j, n) returns an index in [0, n) for the j-th pick in the i-th group
    const gen_pg = (select) =>
    {
        let pg = [ osd_tree ];
        for (let level = 0; level < max_level; level++)
        {
            let npg = [];
            for (let i = 0; i < pg.length; i++)
            {
                const keys = pg[i] instanceof Object ? Object.keys(pg[i]) : [];
                const max_keys = keys.length < size_per_level[level] ? keys.length : size_per_level[level];
                for (let j = 0; j < max_keys; j++)
                {
                    // Unordered: picks go left to right, so leave enough keys for the remaining picks
                    const r = select(level, i, j, (ordered ? keys.length : (keys.length - (max_keys - j - 1))));
                    const el = pg[i][keys[r]] instanceof Object ? pg[i][keys[r]] : keys[r];
                    npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = el;
                    // Ordered: remove only the picked key. Unordered: also drop everything before it
                    keys.splice(ordered ? r : 0, ordered ? 1 : (r+1));
                }
                for (let j = max_keys; j < size_per_level[level]; j++)
                    npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = NO_OSD;
            }
            pg = npg;
        }
        return pg;
    };
    const r = {};
    // Generate random combinations including each OSD at least once
    let has_next = true;
    let ctr = [];
    while (has_next)
    {
        let pg = gen_pg((level, i, j, n) =>
        {
            if (i == 0 && j == 0)
            {
                // Select a pre-determined OSD in the first position on each level
                const r = ctr[level] == null || ctr[level][1] != n ? 0 : ctr[level][0];
                ctr[level] = [ r, n ];
                return r;
            }
            return rng() % n;
        });
        // Advance the per-level counters like an odometer, deepest level first
        for (let i = ctr.length-1; i >= 0; i--)
        {
            if (ctr[i] == null)
            {
                // Nothing was selected on this level (empty first group) - carry to the upper level
                continue;
            }
            ctr[i][0]++;
            if (ctr[i][0] < ctr[i][1])
                break;
            else
                ctr[i] = null;
        }
        has_next = ctr[0] != null;
        const cyclic_pgs = [ pg ];
        if (ordered)
            for (let i = 1; i < pg.length; i++)
                cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
        for (const pg of cyclic_pgs)
            r['pg_'+pg.join('_')] = pg;
    }
    // Generate purely random combinations
    while (count > 0)
    {
        let pg = gen_pg((l, i, j, n) => rng() % n);
        r['pg_'+pg.join('_')] = pg;
        count--;
    }
    return r;
}
// ordered = don't treat (x,y) and (y,x) as equal
@ -752,11 +865,12 @@ module.exports = {
pg_weights_space_efficiency,
pg_list_space_efficiency,
pg_per_osd_space_efficiency,
flatten_tree,
extract_tree_levels,
lp_solve,
make_int_pgs,
align_pgs,
random_combinations,
random_hier_combinations,
all_combinations,
};

View File

@ -159,6 +159,10 @@ const etcd_tree = {
// number of parity chunks, required for EC
parity_chunks?: 1,
pg_count: 100,
// failure_domain = string | { string: int }
// the second case specifies multiple failure domains. example:
// { datacenter: 3, host: 2 } - means 3 datacenters with 2 hosts each, for EC 4+2
// guarantees availability on outage of either 1 datacenter or 2 hosts
failure_domain: 'host',
max_osd_combinations: 10000,
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
@ -1027,6 +1031,32 @@ class Mon
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
if (pool_cfg.failure_domain instanceof Object)
{
for (const key in pool_cfg.failure_domain)
{
const cnt = parseInt(pool_cfg.failure_domain[key]);
if (!cnt || cnt <= 0)
{
if (warn)
console.log('Pool '+pool_id+' specifies invalid item count for failure domain \"'+key+'\"');
return false;
}
if (key !== 'host' && key != 'osd' && !(key in this.config.placement_levels||{}))
{
if (warn)
console.log('Pool '+pool_id+' uses invalid failure domain \"'+key+'\"');
return false;
}
}
}
else if (pool_cfg.failure_domain !== 'host' && pool_cfg.failure_domain != 'osd' &&
!(pool_cfg.failure_domain in this.config.placement_levels||{}))
{
if (warn)
console.log('Pool '+pool_id+' uses invalid failure domain \"'+pool_cfg.failure_domain+'\"');
return false;
}
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
if (!/^[1-9]\d*$/.exec(''+pool_id))
{
@ -1112,27 +1142,23 @@ class Mon
filter_osds_by_tags(orig_tree, flat_tree, tags)
{
if (!tags)
{
return;
}
return 1;
for (const tag of (tags instanceof Array ? tags : [ tags ]))
{
for (const host in flat_tree)
for (const item in flat_tree)
{
let found = 0;
for (const osd in flat_tree[host])
if (flat_tree[item] instanceof Object)
{
if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
delete flat_tree[host][osd];
else
found++;
}
if (!found)
{
delete flat_tree[host];
if (!filter_osds_by_tags(orig_tree, flat_tree[item], tags))
delete flat_tree[item];
}
else if (!orig_tree[item].tags || !orig_tree[item].tags[tag])
delete flat_tree[item];
}
}
for (const item in flat_tree)
return 1;
return 0;
}
get_affinity_osds(pool_cfg, up_osds, osd_tree)
@ -1191,9 +1217,11 @@ class Mon
{
continue;
}
let pool_tree = osd_tree[pool_cfg.root_node || ''];
pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
let pool_tree = osd_tree[pool_cfg.root_node || ''] || {};
const failure_domains = pool_cfg.failure_domain instanceof Object
? [ ...Object.keys(pool_cfg.failure_domain), 'osd' ]
: [ pool_cfg.failure_domain, 'osd' ];
pool_tree = LPOptimizer.extract_tree_levels(pool_tree, failure_domains, levels);
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
// These are for the purpose of building history.osd_sets
const real_prev_pgs = [];
@ -1220,6 +1248,9 @@ class Mon
pg_count: pool_cfg.pg_count,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
hier_sizes: pool_cfg.failure_domain instanceof Object
? [ ...Object.values(pool_cfg.failure_domain), 1 ]
: null,
max_combinations: pool_cfg.max_osd_combinations,
ordered: pool_cfg.scheme != 'replicated',
};
@ -1275,7 +1306,7 @@ class Mon
} });
}
LPOptimizer.print_change_stats(optimize_result);
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length); // FIXME requires hier support too
this.state.pool.stats[pool_id] = {
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
total_raw_tb: optimize_result.space,

View File

@ -36,7 +36,7 @@ const crush_tree = [
] },
];
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
const osd_tree = LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {});
console.log(osd_tree);
async function run()
@ -47,32 +47,32 @@ async function run()
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1'];
cur_tree['l1_1'] = osd_tree['l1_1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2'];
cur_tree['l1_2'] = osd_tree['l1_2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3'];
cur_tree['l1_3'] = osd_tree['l1_3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 36 && res.total_space == 36);
console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3'];
delete cur_tree['l1_3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2'];
delete cur_tree['l1_2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1'];
delete cur_tree['l1_1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);

View File

@ -108,7 +108,11 @@ async function run()
LPOptimizer.print_change_stats(res, false);
console.log('\n256 PGs, size=3, failure domain=rack');
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
res = await LPOptimizer.optimize_initial({
osd_tree: LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {}),
pg_size: 3,
pg_count: 256,
});
LPOptimizer.print_change_stats(res, false);
}

56
mon/test-random-hier.js Normal file
View File

@ -0,0 +1,56 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js');
// Three-level fixture: 5 top-level groups, each with 2 sub-groups of 2 leaves (all weights 1)
const osd_tree = {
100: { 110: { 111: 1, 112: 1 }, 120: { 121: 1, 122: 1 } },
200: { 210: { 211: 1, 212: 1 }, 220: { 221: 1, 222: 1 } },
300: { 310: { 311: 1, 312: 1 }, 320: { 321: 1, 322: 1 } },
400: { 410: { 411: 1, 412: 1 }, 420: { 421: 1, 422: 1 } },
500: { 510: { 511: 1, 512: 1 }, 520: { 521: 1, 522: 1 } },
};
// Two-level fixture: the same leaves as above, placed directly under the 5 top-level groups
const osd_tree2 = {
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
500: { 511: 1, 512: 1, 521: 1, 522: 1 },
};
// Same as osd_tree2, except that the last group (500) has only one leaf
const osd_tree3 = {
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
500: { 511: 1 },
};
// Smoke test: prints the generated combinations for the fixture trees and checks
// extract_tree_levels() against a known result. Rejects with an Error on mismatch.
async function run()
{
    // Print a value and hand it back so it can be inspected afterwards
    const show = (value) =>
    {
        console.log(value);
        return value;
    };
    show(LPOptimizer.random_hier_combinations(osd_tree, [ 3, 2, 1 ], 10000, false, true));
    show(LPOptimizer.random_hier_combinations(osd_tree2, [ 3, 2 ], 0, false, true));
    // Will contain 'Z':
    show(LPOptimizer.random_combinations(osd_tree2, 6, 0, true));
    // One OSD sits below rack/host, another one directly below the datacenter
    const sample_tree = {
        level: 'dc',
        children: [
            {
                level: 'rack',
                children: [
                    {
                        level: 'host',
                        children: [
                            { level: 'osd', id: 'OSD5', size: 10 },
                        ],
                    },
                ],
            },
            { level: 'osd', id: 'OSD10', size: 10 },
        ],
    };
    const extracted = show(LPOptimizer.extract_tree_levels(
        sample_tree,
        [ 'rack', 'osd' ],
        { dc: 1, rack: 2, host: 3, osd: 4 }
    ));
    const serialized = JSON.stringify(extracted);
    if (serialized != '{"rack1":{"OSD5":10},"rack2":{"OSD10":10}}')
    {
        throw new Error('extract_tree_levels failed');
    }
    // should not contain Z (expectation of the test author, not asserted here):
    show(LPOptimizer.random_hier_combinations(osd_tree3, [ 3, 2 ], 0, false, true));
    console.log('OK');
}
run().catch(console.error);