From a86788fe3b8cbd548255f336b3c2c4582c3a8a67 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 16 Aug 2020 21:54:55 +0300 Subject: [PATCH] Support optimizing for the case when parity chunks occupy more space than data chunks Mostly as an experiment because the problem solved by this commit comes from Ceph's EC+compression implementation details and I'm not sure if my implementation will be the same --- lp/lp-optimizer.js | 69 ++++++++++++------ lp/mon.js | 14 +++- lp/test-nonuniform.js | 127 +++++++++++++++++++++++++++++++++ lp/test-optimize-undersized.js | 14 ++-- lp/test-optimize.js | 14 ++-- 5 files changed, 199 insertions(+), 39 deletions(-) create mode 100644 lp/test-nonuniform.js diff --git a/lp/lp-optimizer.js b/lp/lp-optimizer.js index 5ed34fe6d..391f28d93 100644 --- a/lp/lp-optimizer.js +++ b/lp/lp-optimizer.js @@ -25,7 +25,7 @@ async function lp_solve(text) let vars = {}; for (const line of stdout.split(/\n/)) { - let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line); + let m = /^(^Value of objective function: (-?[\d\.]+)|Actual values of the variables:)\s*$/.exec(line); if (m) { if (m[2]) @@ -47,22 +47,27 @@ async function lp_solve(text) return { score, vars }; } -async function optimize_initial(osd_tree, pg_size, pg_count, max_combinations) +async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 }) { - max_combinations = max_combinations || 10000; + if (!pg_count || !osd_tree) + { + return null; + } const all_weights = Object.assign({}, ...Object.values(osd_tree)); const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0); all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations)); const pg_per_osd = {}; for (const pg of all_pgs) { - for (const osd of pg) + for (let i = 0; i < pg.length; i++) { + const osd = pg[i]; pg_per_osd[osd] = pg_per_osd[osd] || []; - pg_per_osd[osd].push("pg_"+pg.join("_")); + pg_per_osd[osd].push((i >= pg_minsize ? 
parity_space+'*' : '')+"pg_"+pg.join("_")); } } - const pg_effsize = Math.min(Object.keys(osd_tree).length, pg_size); + const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length) + + Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space; let lp = ''; lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n"; for (const osd in pg_per_osd) @@ -81,11 +86,19 @@ async function optimize_initial(osd_tree, pg_size, pg_count, max_combinations) const lp_result = await lp_solve(lp); if (!lp_result) { + console.log(lp); throw new Error('Problem is infeasible or unbounded - is it a bug?'); } const int_pgs = make_int_pgs(lp_result.vars, pg_count); - const eff = pg_list_space_efficiency(int_pgs, all_weights); - return { score: lp_result.score, weights: lp_result.vars, int_pgs, space: eff*pg_effsize, total_space: total_weight }; + const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space); + const res = { + score: lp_result.score, + weights: lp_result.vars, + int_pgs, + space: eff * pg_effsize, + total_space: total_weight, + }; + return res; } function make_int_pgs(weights, pg_count) @@ -210,10 +223,14 @@ function add_valid_previous(osd_tree, prev_weights, all_pgs) } // Try to minimize data movement -async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations) +async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 }) { - max_combinations = max_combinations || 10000; - const pg_effsize = Math.min(Object.keys(osd_tree).length, pg_size); + if (!osd_tree) + { + return null; + } + const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length) + + Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space; const pg_count = prev_int_pgs.length; const prev_weights = {}; const prev_pg_per_osd = {}; @@ -221,10 +238,11 @@ async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations { const pg_name = 'pg_'+pg.join('_'); prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1; - for (const osd of pg) + for (let i = 0; i < pg.length; i++) { + const osd = pg[i]; prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || []; - prev_pg_per_osd[osd].push(pg_name); + prev_pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]); } } // Get all combinations @@ -235,10 +253,11 @@ async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations for (const pg of all_pgs) { const pg_name = 'pg_'+pg.join('_'); - for (const osd of pg) + for (let i = 0; i < pg.length; i++) { + const osd = pg[i]; pg_per_osd[osd] = pg_per_osd[osd] || []; - pg_per_osd[osd].push(pg_name); + pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]); } } // Penalize PGs based on their similarity to old PGs @@ -257,9 +276,12 @@ async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations { if (osd !== NO_OSD) { - const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + '); - const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => all_pgs_hash[old_pg_name]).length; - let osd_pg_count = all_weights[osd]*pg_size/total_weight*pg_count - rm_osd_pg_count; + const osd_sum = (pg_per_osd[osd]||[]).map(([ pg_name, space ]) => ( + prev_weights[pg_name] ? 
`${space} * add_${pg_name} - ${space} * del_${pg_name}` : `${space} * ${pg_name}` + )).join(' + '); + const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]) + .reduce((a, [ old_pg_name, space ]) => (a + (all_pgs_hash[old_pg_name] ? space : 0)), 0); + const osd_pg_count = all_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count; lp += osd_sum + ' <= ' + osd_pg_count + ';\n'; } } @@ -308,7 +330,7 @@ async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations { weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]); } - else + else if (k.substr(0, 3) === 'pg_') { weights[k] = Number(lp_result.vars[k]); } @@ -345,7 +367,7 @@ async function optimize_change(prev_int_pgs, osd_tree, pg_size, max_combinations int_pgs: new_pgs, differs, osd_differs, - space: pg_effsize * pg_list_space_efficiency(new_pgs, all_weights), + space: pg_effsize * pg_list_space_efficiency(new_pgs, all_weights, pg_minsize, parity_space), total_space: total_weight, }; } @@ -608,14 +630,15 @@ function pg_weights_space_efficiency(weights, pg_count, osd_sizes) return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes); } -function pg_list_space_efficiency(pgs, osd_sizes) +function pg_list_space_efficiency(pgs, osd_sizes, pg_minsize, parity_space) { const per_osd = {}; for (const pg of pgs) { - for (const osd of pg) + for (let i = 0; i < pg.length; i++) { - per_osd[osd] = (per_osd[osd]||0) + 1; + const osd = pg[i]; + per_osd[osd] = (per_osd[osd]||0) + (i >= pg_minsize ? (parity_space||1) : 1); } } return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes); diff --git a/lp/mon.js b/lp/mon.js index 713205fc3..a43f07a21 100644 --- a/lp/mon.js +++ b/lp/mon.js @@ -550,11 +550,21 @@ class Mon } this.scale_pg_count(prev_pgs, pg_history, new_pg_count); } - optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, 3, tree_cfg.max_osd_combinations); + optimize_result = await LPOptimizer.optimize_change({ + prev_pgs, + osd_tree: tree_cfg.osd_tree, + pg_size: 3, + max_combinations: tree_cfg.max_osd_combinations, + }); } else { - optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, 3, tree_cfg.pg_count, tree_cfg.max_osd_combinations); + optimize_result = await LPOptimizer.optimize_initial({ + osd_tree: tree_cfg.osd_tree, + pg_size: 3, + pg_count: tree_cfg.pg_count, + max_combinations: tree_cfg.max_osd_combinations, + }); } if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash)) { diff --git a/lp/test-nonuniform.js b/lp/test-nonuniform.js new file mode 100644 index 000000000..1a47af353 --- /dev/null +++ b/lp/test-nonuniform.js @@ -0,0 +1,127 @@ +// Interesting real-world example coming from Ceph with EC and compression enabled. +// EC parity chunks can't be compressed as efficiently as data chunks, +// thus they occupy more space (2.26x more space) in OSD object stores. +// This leads to really uneven OSD fill ratio in Ceph even when PGs are perfectly balanced. +// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change(). 
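+
+// A quick sanity check of the "effective PG size" the optimizer uses
+// (a worked example with the values used below: pg_size = 3, pg_minsize = 2,
+// parity_space = 2.26, and 4 failure domains in osd_tree):
+//
+//   pg_effsize = min(pg_minsize, failure_domains)
+//              + max(0, min(pg_size, failure_domains) - pg_minsize) * parity_space
+//              = 2 + 1 * 2.26 = 4.26
+//
+// This is where the 4.26 factor in run() below comes from.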
+ +const LPOptimizer = require('./lp-optimizer.js'); + +const osd_tree = { + ripper5: { + osd0: 3.493144989013672, + osd1: 3.493144989013672, + osd2: 3.454082489013672, + osd12: 3.461894989013672, + }, + ripper7: { + osd4: 3.638690948486328, + osd5: 3.638690948486328, + osd6: 3.638690948486328, + }, + ripper4: { + osd9: 3.4609375, + osd10: 3.4609375, + osd11: 3.4609375, + }, + ripper6: { + osd3: 3.5849609375, + osd7: 3.5859336853027344, + osd8: 3.638690948486328, + osd13: 3.461894989013672 + }, +}; + +const prev_pgs = [[12,7,5],[6,11,12],[3,6,9],[10,0,5],[2,5,13],[9,8,6],[3,4,12],[7,4,12],[12,11,13],[13,6,0],[4,13,10],[9,7,6],[7,10,0],[10,8,0],[3,10,2],[3,0,4],[6,13,0],[13,10,0],[13,10,5],[8,11,6],[3,9,2],[2,8,5],[8,9,5],[3,12,11],[0,7,4],[13,11,1],[11,3,12],[12,8,10],[7,5,12],[2,13,5],[7,11,0],[13,2,6],[0,6,8],[13,1,6],[0,13,4],[0,8,10],[4,10,0],[8,12,4],[8,12,9],[12,7,4],[13,9,5],[3,2,11],[1,9,7],[1,8,5],[5,12,9],[3,5,12],[2,8,10],[0,8,4],[1,4,11],[7,10,2],[12,13,5],[3,1,11],[7,1,4],[4,12,8],[7,0,9],[11,1,8],[3,0,5],[11,13,0],[1,13,5],[12,7,10],[12,8,4],[11,13,5],[0,11,6],[2,11,3],[13,1,11],[2,7,10],[7,10,12],[7,12,10],[12,11,5],[13,12,10],[2,3,9],[4,3,9],[13,2,5],[7,12,6],[12,10,13],[9,8,1],[13,1,5],[9,5,12],[5,11,7],[6,2,9],[8,11,6],[12,5,8],[6,13,1],[7,6,11],[2,3,6],[8,5,9],[1,13,6],[9,3,2],[7,11,1],[3,10,1],[0,11,7],[3,0,5],[1,3,6],[6,0,9],[3,11,4],[8,10,2],[13,1,9],[12,6,9],[3,12,9],[12,8,9],[7,5,0],[8,12,5],[0,11,3],[12,11,13],[0,7,11],[0,3,10],[1,3,11],[2,7,11],[13,2,6],[9,12,13],[8,2,4],[0,7,4],[5,13,0],[13,12,9],[1,9,8],[0,10,3],[3,5,10],[7,12,9],[2,13,4],[12,7,5],[9,2,7],[3,2,9],[6,2,7],[3,1,9],[4,3,2],[5,3,11],[0,7,6],[1,6,13],[7,10,2],[12,4,8],[13,12,6],[7,5,11],[6,2,3],[2,7,6],[2,3,10],[2,7,10],[11,12,6],[0,13,5],[10,2,4],[13,0,11],[7,0,6],[8,9,4],[8,4,11],[7,11,2],[3,4,2],[6,1,3],[7,2,11],[8,9,4],[11,4,8],[10,3,1],[2,10,13],[1,7,11],[13,11,12],[2,6,9],[10,0,13],[7,10,4],[0,11,13],[13,10,1],[7,5,0],[7,12,10],[3,1,4],[7,1,5],[3,11,5],[7,5,0],[1,3,5],[10,5,12],[0,3,9],[7,1,11],[11,8,12],[3,6,2],[7,12,9],[7,11,12],[4,11,3],[0,11,13],[13,2,5],[1,5,8],[0,11,8],[3,5,1],[11,0,6],[3,11,2],[11,8,12],[4,1,3],[10,13,4],[13,9,6],[2,3,10],[12,7,9],[10,0,4],[10,13,2],[3,11,1],[7,2,9],[1,7,4],[13,1,4],[7,0,6],[5,3,9],[10,0,7],[0,7,10],[3,6,10],[13,0,5],[8,4,1],[3,1,10],[2,10,13],[13,0,5],[13,10,2],[12,7,9],[6,8,10],[6,1,8],[10,8,1],[13,5,0],[5,11,3],[7,6,1],[8,5,9],[2,13,11],[10,12,4],[13,4,1],[2,13,4],[11,7,0],[2,9,7],[1,7,6],[8,0,4],[8,1,9],[7,10,12],[13,9,6],[7,6,11],[13,0,4],[1,8,4],[3,12,5],[10,3,1],[10,2,13],[2,4,8],[6,2,3],[3,0,10],[6,7,12],[8,12,5],[3,0,6],[13,12,10],[11,3,6],[9,0,13],[10,0,6],[7,5,2],[1,3,11],[7,10,2],[2,9,8],[11,13,12],[0,8,4],[8,12,11],[6,0,3],[1,13,4],[11,8,2],[12,3,6],[4,7,1],[7,6,12],[3,10,6],[0,10,7],[8,9,1],[0,10,6],[8,10,1]] + .map(pg => pg.map(n => 'osd'+n)); + +const by_osd = {}; + +for (let i = 0; i < prev_pgs.length; i++) +{ + for (let j = 0; j < prev_pgs[i].length; j++) + { + by_osd[prev_pgs[i][j]] = by_osd[prev_pgs[i][j]] || []; + by_osd[prev_pgs[i][j]][j] = (by_osd[prev_pgs[i][j]][j] || 0) + 1; + } +} + +/* + +This set of PGs was balanced by hand, by heavily tuning OSD weights in Ceph: + +{ + osd0: 4.2, + osd1: 3.5, + osd2: 3.45409, + osd3: 4.5, + osd4: 1.4, + osd5: 1.4, + osd6: 1.75, + osd7: 4.5, + osd8: 4.4, + osd9: 2.2, + osd10: 2.7, + osd11: 2, + osd12: 3.4, + osd13: 3.4, +} + +EC+compression is a nightmare in Ceph, yeah :)) + +To calculate the average ratio between data chunks and parity chunks we +calculate the number of PG chunks for each 
chunk role for each OSD: + +{ + osd12: [ 18, 22, 17 ], + osd7: [ 35, 22, 8 ], + osd5: [ 6, 17, 27 ], + osd6: [ 13, 12, 28 ], + osd11: [ 13, 26, 20 ], + osd3: [ 30, 20, 10 ], + osd9: [ 8, 12, 26 ], + osd10: [ 15, 23, 20 ], + osd0: [ 22, 22, 14 ], + osd2: [ 22, 16, 16 ], + osd13: [ 29, 19, 13 ], + osd8: [ 20, 18, 12 ], + osd4: [ 8, 10, 28 ], + osd1: [ 17, 17, 17 ] +} + +And now we can pick a pair of OSDs and determine the ratio by solving the following: + +osd5 = 23*X + 27*Y = 3249728140 +osd13 = 48*X + 13*Y = 2991675992 + +=> + +osd5 - 27/13*osd13 = 23*X - 27/13*48*X = -76.6923076923077*X = -2963752766.46154 + +=> + +X = 38644720.1243731 +Y = (osd5-23*X)/27 = 87440725.0792377 +Y/X = 2.26268232239284 ~= 2.26 + +Which means that parity chunks are compressed ~2.26 times worse than data chunks. + +Fine, let's try to optimize for it. + +*/ + +async function run() +{ + const all_weights = Object.assign({}, ...Object.values(osd_tree)); + const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0); + const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, all_weights, 2, 2.26); + const orig = eff*4.26 / total_weight; + console.log('Original efficiency was: '+Math.round(orig*10000)/100+' %'); + + let prev = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256, parity_space: 2.26 }); + LPOptimizer.print_change_stats(prev); + + let next = await LPOptimizer.optimize_change({ prev_pgs, osd_tree, pg_size: 3, max_combinations: 10000, parity_space: 2.26 }); + LPOptimizer.print_change_stats(next); +} + +run().catch(console.error); diff --git a/lp/test-optimize-undersized.js b/lp/test-optimize-undersized.js index b2913a7c2..b9031d13d 100644 --- a/lp/test-optimize-undersized.js +++ b/lp/test-optimize-undersized.js @@ -40,31 +40,31 @@ async function run() { const cur_tree = {}; console.log('Empty tree:'); - let res = await LPOptimizer.optimize_initial(cur_tree, 3, 256); + let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 }); LPOptimizer.print_change_stats(res, false); console.log('\nAdding 1st failure domain:'); cur_tree['dom1'] = osd_tree['dom1']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\nAdding 2nd failure domain:'); cur_tree['dom2'] = osd_tree['dom2']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\nAdding 3rd failure domain:'); cur_tree['dom3'] = osd_tree['dom3']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\nRemoving 3rd failure domain:'); delete cur_tree['dom3']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\nRemoving 2nd failure domain:'); delete cur_tree['dom2']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); 
console.log('\nRemoving 1st failure domain:'); delete cur_tree['dom1']; - res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); } diff --git a/lp/test-optimize.js b/lp/test-optimize.js index 62fa75592..91cda9c46 100644 --- a/lp/test-optimize.js +++ b/lp/test-optimize.js @@ -81,31 +81,31 @@ async function run() // Space efficiency is ~99% in all cases. console.log('256 PGs, size=2'); - res = await LPOptimizer.optimize_initial(osd_tree, 2, 256); + res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 }); LPOptimizer.print_change_stats(res, false); console.log('\nAdding osd.8'); osd_tree[500][8] = 3.58589; - res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree, 2); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }); LPOptimizer.print_change_stats(res, false); console.log('\nRemoving osd.8'); delete osd_tree[500][8]; - res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree, 2); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }); LPOptimizer.print_change_stats(res, false); console.log('\n256 PGs, size=3'); - res = await LPOptimizer.optimize_initial(osd_tree, 3, 256); + res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256 }); LPOptimizer.print_change_stats(res, false); console.log('\nAdding osd.8'); osd_tree[500][8] = 3.58589; - res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\nRemoving osd.8'); delete osd_tree[500][8]; - res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree, 3); + res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }); LPOptimizer.print_change_stats(res, false); console.log('\n256 PGs, size=3, failure domain=rack'); - res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 3, 256); + res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 }); LPOptimizer.print_change_stats(res, false); }