Wrap cluster AFR calculator in a webpage
commit
07fbb7e98c
|
@ -0,0 +1,7 @@
|
||||||
|
{
|
||||||
|
"presets": [ [ "env" ], "stage-1" ],
|
||||||
|
"retainLines": true,
|
||||||
|
"plugins": [
|
||||||
|
[ "transform-react-jsx", { "pragma": "preact.h" } ]
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,145 @@
|
||||||
|
// Functions to calculate Annualized Failure Rate of your cluster
|
||||||
|
// if you know AFR of your drives, number of drives, expected rebalance time
|
||||||
|
// and replication factor
|
||||||
|
// License: VNPL-1.0 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
|
||||||
|
// Author: Vitaliy Filippov, 2020+
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
cluster_afr_fullmesh,
|
||||||
|
failure_rate_fullmesh,
|
||||||
|
cluster_afr,
|
||||||
|
print_cluster_afr,
|
||||||
|
c_n_k,
|
||||||
|
};
|
||||||
|
|
||||||
|
/******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
|
||||||
|
|
||||||
|
// Estimate the annualized failure rate (AFR) of a "full mesh" cluster.
//   n   - number of drives
//   afr - annualized failure rate of a single drive
//   l   - expected rebalance time in days after a single drive failure
//   k   - replication factor / number of drives that must fail at the same time for the cluster to fail
// Returns the probability (0..1) computed from these inputs.
function cluster_afr_fullmesh(n, afr, l, k)
{
    // Failures required in addition to the first failed drive
    const extra_failures = k-1;
    // Drive count used both as the peer count and as the final exponent
    const remaining = n-extra_failures;
    // Single-drive AFR scaled to the rebalance window (l days out of 365)
    const window_rate = afr*l/365;
    const p_loss_per_drive = afr * failure_rate_fullmesh(remaining, window_rate, extra_failures);
    return 1 - (1 - p_loss_per_drive) ** remaining;
}
|
||||||
|
|
||||||
|
// Probability of at least <f> failures in a cluster with <n> drives with AFR=<a>.
// Computed as 1 minus the sum of the binomial probabilities of exactly
// 0, 1, ..., f-1 failures.
// Note: for f <= 0 the function returns (1-a)^n, i.e. the probability that
// none of the <n> drives fails.
function failure_rate_fullmesh(n, a, f)
{
    const survive = 1-a;
    if (f <= 0)
    {
        return survive**n;
    }
    let remaining = 1;
    let failed = 0;
    while (failed < f)
    {
        // Subtract P(exactly <failed> drives out of n fail)
        remaining -= c_n_k(n, failed) * survive**(n-failed) * a**failed;
        failed++;
    }
    return remaining;
}
|
||||||
|
|
||||||
|
/******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
|
||||||
|
|
||||||
|
// Cluster AFR for replicated pools where each OSD only has a limited set of peers.
//
// Model: <n_hosts> hosts of <n_drives> drives of <capacity> GB, each able to backfill
// at <speed> GB/s, <replicas> replicas, <pgs> unique peer PGs per OSD.
//
// For each of the n_hosts*n_drives drives:
//   P(drive fails in a year) * P(any of its peer groups fails during the rebalance window).
// More peers per OSD increase rebalance speed (more drives work together to resilver)
// if you let them finish rebalance BEFORE replacing the failed drive.
// At the same time, more peers per OSD increase the probability of any of them failing.
//
// The probability of all other drives of one replica group failing during the window
// is (AFR*L)^(replicas-1), so with <x> PGs it becomes ~ x * (AFR*L)^(replicas-1).
// An interesting but reasonable consequence: with 2 replicas the total failure rate
// does not depend on the number of peers per OSD, because it is increased linearly
// by the number of peers that may fail and decreased linearly by the shorter rebalance time.
//
// <degraded_replacement> = true means the rebalance is not sped up by peers
// (the rebalance time is computed as if there were a single peer).
function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    const peers_needed = replicas-1;
    // Cannot have more distinct peer groups than the other hosts' drives allow
    const peer_groups = Math.min(pgs, (n_hosts-1)*n_drives/peers_needed);
    const parallelism = degraded_replacement ? 1 : peer_groups;
    // Rebalance time as a fraction of a year (seconds -> days -> years)
    const l = capacity/parallelism/speed/86400/365;
    const p_group = (afr_drive*l)**peers_needed;
    const p_any_group = 1-(1-p_group)**peer_groups;
    return 1 - (1 - afr_drive * p_any_group) ** (n_hosts*n_drives);
}
|
||||||
|
|
||||||
|
// Cluster AFR for erasure-coded pools (drive failures only).
// Same model as cluster_afr_pgs(), but a PG consists of ec_data+ec_parity drives and
// data is lost when, after one drive fails, at least <ec_parity> of the remaining
// ec_total-1 drives of the group fail during the rebalance window.
//   ec - [ ec_data, ec_parity ]; numeric strings are accepted and converted to numbers
function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    // Convert explicitly: with string inputs (e.g. values taken from form fields)
    // "+" would concatenate ("2"+1 === "21") instead of adding
    const parity = Number(ec_parity);
    const ec_total = Number(ec_data)+parity;
    // Cannot have more distinct peer groups than the other hosts' drives allow
    const peer_groups = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
    // Rebalance time as a fraction of a year (seconds -> days -> years)
    const l = capacity/(degraded_replacement ? 1 : peer_groups)/speed/86400/365;
    const p_group = failure_rate_fullmesh(ec_total-1, afr_drive*l, parity);
    return 1 - (1 - afr_drive * (1-(1-p_group)**peer_groups)) ** (n_hosts*n_drives);
}
|
||||||
|
|
||||||
|
// Same as cluster_afr_pgs(), but also takes whole-server failures into account.
//   afr_host - annualized failure rate of a single host
// The result combines two independent "no data loss" factors:
//   - no loss triggered by a host failure (rebalance time <lh> covers all n_drives of the host)
//   - no loss triggered by a single drive failure (rebalance time <ld>)
function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    const peers_needed = replicas-1;
    // Computed from the requested (unclamped) pgs value
    const otherhosts = Math.min(pgs, (n_hosts-1)/peers_needed);
    const max_groups = (n_hosts-1)*n_drives/peers_needed;
    // Peer groups per drive and per host, limited by what the other hosts can provide
    const drive_groups = Math.min(pgs, max_groups);
    const host_groups = Math.min(drive_groups*n_drives, max_groups);
    // Rebalance times as fractions of a year: after a drive failure / after a host failure
    const ld = capacity/(degraded_replacement ? 1 : drive_groups)/speed/86400/365;
    const lh = n_drives*capacity/drive_groups/speed/86400/365;
    // Effective failure rate of a peer: its own AFR plus a share of its host's AFR
    const peer_rate = afr_drive+afr_host*drive_groups/otherhosts;
    const p1 = peer_rate*lh;
    const p2 = peer_rate*ld;
    const hosts_ok = (1 - afr_host * (1-(1-p1**peers_needed)**host_groups)) ** n_hosts;
    const drives_ok = (1 - afr_drive * (1-(1-p2**peers_needed)**drive_groups)) ** (n_hosts*n_drives);
    return 1 - hosts_ok * drives_ok;
}
|
||||||
|
|
||||||
|
// Cluster AFR for erasure-coded pools, taking both drive and whole-server failures
// into account. Combines the structure of cluster_afr_pgs_hosts() (separate host and
// drive factors) with the EC group failure probability used by cluster_afr_pgs_ec().
//   ec - [ ec_data, ec_parity ]; numeric strings are accepted and converted to numbers
function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    // Convert explicitly: with string inputs (e.g. values taken from form fields)
    // "+" would concatenate ("2"+1 === "21") instead of adding
    const parity = Number(ec_parity);
    const ec_total = Number(ec_data)+parity;
    // Computed from the requested (unclamped) pgs value
    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
    const max_groups = (n_hosts-1)*n_drives/(ec_total-1);
    // Peer groups per drive and per host, limited by what the other hosts can provide
    const drive_groups = Math.min(pgs, max_groups);
    const host_groups = Math.min(drive_groups*n_drives, max_groups);
    // Rebalance times as fractions of a year: after a drive failure / after a host failure
    const ld = capacity/(degraded_replacement ? 1 : drive_groups)/speed/86400/365;
    const lh = n_drives*capacity/drive_groups/speed/86400/365;
    // Effective failure rate of a peer: its own AFR plus a share of its host's AFR
    const peer_rate = afr_drive+afr_host*drive_groups/otherhosts;
    const p1 = peer_rate*lh;
    const p2 = peer_rate*ld;
    const hosts_ok = (1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, parity))**host_groups)) ** n_hosts;
    const drives_ok = (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, parity))**drive_groups)) ** (n_hosts*n_drives);
    return 1 - hosts_ok * drives_ok;
}
|
||||||
|
|
||||||
|
// Dispatcher for the four cluster_afr_pgs* variants.
// Picks the implementation by two config properties:
//   config.ec       - truthy selects the erasure-coded model, otherwise replication
//   config.afr_host - truthy additionally accounts for whole-host failures
// The whole config object is passed through to the selected function.
function cluster_afr(config)
{
    if (config.ec)
    {
        return config.afr_host
            ? cluster_afr_pgs_ec_hosts(config)
            : cluster_afr_pgs_ec(config);
    }
    return config.afr_host
        ? cluster_afr_pgs_hosts(config)
        : cluster_afr_pgs(config);
}
|
||||||
|
|
||||||
|
// Print a human-readable description of <config> followed by the resulting
// cluster AFR (in percent, 7 decimal places) to the console.
// Reads from config: n_hosts, n_drives, capacity (printed /1000 as TB),
// speed (printed *1000 as MB/s), afr_drive, afr_host (optional), ec or replicas,
// pgs (printed as 1 when falsy), degraded_replacement.
function print_cluster_afr(config)
{
    // Fixed-point formatting. Replaces the former sprintf("%.Nf", x) calls:
    // sprintf is neither defined nor imported in this file, so they threw ReferenceError.
    const fixed = (value, digits) => Number(value).toFixed(digits);
    console.log(
        `${config.n_hosts} nodes with ${config.n_drives} ${fixed(config.capacity/1000, 1)}TB drives`+
        `, capable to backfill at ${fixed(config.speed*1000, 1)} MB/s, drive AFR ${fixed(config.afr_drive*100, 1)}%`+
        (config.afr_host ? `, host AFR ${fixed(config.afr_host*100, 1)}%` : '')+
        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
        `, ${config.pgs||1} PG per OSD`+
        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
    );
    console.log('-> '+fixed(100*cluster_afr(config), 7)+'%'+'\n');
}
|
||||||
|
|
||||||
|
/******** UTILITY ********/
|
||||||
|
|
||||||
|
// Combination count: C(n, k) = n! / (k! * (n-k)!), computed iteratively.
// Returns 1 for k <= 0.
function c_n_k(n, k)
{
    let r = 1;
    for (let i = 0; i < k; i++)
    {
        // Multiply first, then divide: r*(n-i) equals C(n, i+1)*(i+1), so for integer
        // inputs the division is exact and every intermediate value stays an integer.
        // Dividing first (r *= (n-i)/(i+1)) rounds the quotient and can yield
        // e.g. 454.99999999999994 instead of 455 for C(15, 3).
        r = r * (n-i) / (i+1);
    }
    return r;
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Калькулятор вероятности отказа кластера Ceph/Vitastor</title>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<style>
|
||||||
|
* { box-sizing: border-box; }
|
||||||
|
body { margin: 0; font-size: 15px; font-family: Arial, Helvetica, sans-serif; }
|
||||||
|
input { font-size: inherit; font-family: inherit; vertical-align: middle; }
|
||||||
|
table { border-collapse: collapse; margin-left: 110px; }
|
||||||
|
td { padding: 5px; }
|
||||||
|
th { text-align: left; font-weight: normal; white-space: nowrap; width: 1%; }
|
||||||
|
input[type="text"] { border: 1px solid #aaa; padding: 4px; border-radius: 3px; }
|
||||||
|
.switch { float: left; border: 1px solid #aaa; color: #666; height: 30px; padding: 5px 10px; transition: all 200ms ease-in-out; cursor: pointer; position: relative; overflow: hidden; }
|
||||||
|
.switch input { visibility: hidden; position: absolute; top: 0; left: 0; right: 0; bottom: 0; }
|
||||||
|
.switch.l { border-right-width: 0; border-radius: 20px 0 0 20px; padding-left: 15px; }
|
||||||
|
.switch.r { border-left-width: 0; border-radius: 0 20px 20px 0; padding-right: 15px; }
|
||||||
|
.switch.sel { border-color: #08f; background: #08f; color: white; }
|
||||||
|
.switch:hover { border-color: #4af; background: #4af; color: white; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
<script type="text/javascript" src="dist/main.js"></script>
|
||||||
|
</html>
|
|
@ -0,0 +1,151 @@
|
||||||
|
import * as preact from 'preact';
|
||||||
|
/** @jsx preact.h */
|
||||||
|
import { cluster_afr } from './afr.js';
|
||||||
|
|
||||||
|
class Calc extends preact.Component
|
||||||
|
{
|
||||||
|
state = {
|
||||||
|
hosts: 10,
|
||||||
|
drives: 10,
|
||||||
|
afr_drive: 3,
|
||||||
|
afr_host: 5,
|
||||||
|
capacity: 8,
|
||||||
|
speed: 20,
|
||||||
|
ec: false,
|
||||||
|
replicas: 2,
|
||||||
|
ec_data: 2,
|
||||||
|
ec_parity: 1,
|
||||||
|
eager: false,
|
||||||
|
result: 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
calc(st)
|
||||||
|
{
|
||||||
|
st = { ...this.state, ...st };
|
||||||
|
st.result = 100*cluster_afr({
|
||||||
|
n_hosts: st.hosts,
|
||||||
|
n_drives: st.drives,
|
||||||
|
afr_drive: st.afr_drive/100,
|
||||||
|
afr_host: st.afr_host/100,
|
||||||
|
capacity: st.capacity*1000,
|
||||||
|
speed: st.speed/1000,
|
||||||
|
ec: st.ec ? [ st.ec_data, st.ec_parity ] : null,
|
||||||
|
replicas: st.replicas,
|
||||||
|
pgs: 100,
|
||||||
|
degraded_replacement: st.eager,
|
||||||
|
});
|
||||||
|
this.setState(st);
|
||||||
|
}
|
||||||
|
|
||||||
|
setter(field)
|
||||||
|
{
|
||||||
|
if (!this.setter[field])
|
||||||
|
{
|
||||||
|
this.setter[field] = (event) =>
|
||||||
|
{
|
||||||
|
this.calc({ [field]: event.target.value });
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return this.setter[field];
|
||||||
|
}
|
||||||
|
|
||||||
|
setRepl = () =>
|
||||||
|
{
|
||||||
|
this.calc({ ec: false });
|
||||||
|
}
|
||||||
|
|
||||||
|
setEC = () =>
|
||||||
|
{
|
||||||
|
this.calc({ ec: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
setEager = (event) =>
|
||||||
|
{
|
||||||
|
this.calc({ eager: event.target.checked });
|
||||||
|
}
|
||||||
|
|
||||||
|
componentDidMount()
|
||||||
|
{
|
||||||
|
this.calc({});
|
||||||
|
}
|
||||||
|
|
||||||
|
render(props, state)
|
||||||
|
{
|
||||||
|
return (<div style="width: 750px; margin: 20px; padding: 20px; box-shadow: 0 19px 60px rgba(0, 0, 0, 0.3), 0 15px 20px rgba(0, 0, 0, 0.22);">
|
||||||
|
<h2 style="text-align: center; font-size: 150%; margin: 10px 0 20px 0; font-weight: bold">
|
||||||
|
Калькулятор вероятности отказа кластера Ceph/Vitastor
|
||||||
|
</h2>
|
||||||
|
<p>
|
||||||
|
Вероятность полного отказа кластера зависит от числа серверов и дисков
|
||||||
|
(чем их больше, тем вероятность больше), от схемы избыточности, скорости ребаланса (восстановления),
|
||||||
|
и, конечно, непосредственно вероятности выхода из строя самих дисков и серверов.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
Расчёт ведётся в простом предположении, что отказы распределены равномерно во времени.
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Число серверов</th>
|
||||||
|
<td><input type="text" value={state.hosts} onchange={this.setter('hosts')} /></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Число дисков в сервере</th>
|
||||||
|
<td><input type="text" value={state.drives} onchange={this.setter('drives')} /></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Ёмкость дисков</th>
|
||||||
|
<td><input type="text" value={state.capacity} onchange={this.setter('capacity')} /> ТБ</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Схема избыточности</th>
|
||||||
|
<td>
|
||||||
|
<label class={"switch l"+(state.ec ? "" : " sel")}>
|
||||||
|
<input type="radio" name="scheme" checked={!state.ec} onclick={this.setRepl} /> Репликация
|
||||||
|
</label>
|
||||||
|
<label class={"switch r"+(state.ec ? " sel" : "")}>
|
||||||
|
<input type="radio" name="scheme" checked={state.ec} onclick={this.setEC} /> EC (коды коррекции ошибок)
|
||||||
|
</label>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{state.ec ? null : <tr>
|
||||||
|
<th>Число реплик</th>
|
||||||
|
<td><input type="text" value={state.replicas} onchange={this.setter('replicas')} /></td>
|
||||||
|
</tr>}
|
||||||
|
{state.ec ? <tr>
|
||||||
|
<th>Число дисков данных</th>
|
||||||
|
<td><input type="text" value={state.ec_data} onchange={this.setter('ec_data')} /></td>
|
||||||
|
</tr> : null}
|
||||||
|
{state.ec ? <tr>
|
||||||
|
<th>Число дисков чётности</th>
|
||||||
|
<td><input type="text" value={state.ec_parity} onchange={this.setter('ec_parity')} /></td>
|
||||||
|
</tr> : null}
|
||||||
|
<tr>
|
||||||
|
<th>Оценочная скорость<br />восстановления на 1 OSD</th>
|
||||||
|
<td><input type="text" value={state.speed} onchange={this.setter('speed')} /> МБ/с</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th><abbr title="Annualized Failure Rate, вероятность отказа в течение года в %">AFR</abbr> диска</th>
|
||||||
|
<td><input type="text" value={state.afr_drive} onchange={this.setter('afr_drive')} /> %</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>AFR сервера</th>
|
||||||
|
<td><input type="text" value={state.afr_host} onchange={this.setter('afr_host')} /> %</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
<label><input type="checkbox" checked={state.eager} onchange={this.setEager} />
|
||||||
|
Я нетерпеливый и заменяю отказавший диск сразу, не давая данным уехать на остальные диски
|
||||||
|
(либо данным уезжать некуда, например, сервера всего 3 при 3 репликах)
|
||||||
|
</label>
|
||||||
|
</p>
|
||||||
|
<div style="text-align: center; font-size: 150%; margin: 20px 0; font-weight: bold">
|
||||||
|
Вероятность потери данных в течение года:
|
||||||
|
</div>
|
||||||
|
<div style="text-align: center; font-size: 200%; margin: 10px 0; font-weight: bold">
|
||||||
|
{Math.round(state.result*10000)/10000} %
|
||||||
|
</div>
|
||||||
|
</div>);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
preact.render(<Calc />, document.body);
|
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"name": "ceph-afr-calc",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "Ceph/Vitastor cluster failure calculator",
|
||||||
|
"main": "main.js",
|
||||||
|
"scripts": {
|
||||||
|
"build": "webpack",
|
||||||
|
"watch-dev": "NODE_ENV=development webpack --mode=development -w"
|
||||||
|
},
|
||||||
|
"author": "Vitaliy Filippov",
|
||||||
|
"license": "AGPL-3.0",
|
||||||
|
"devDependencies": {
|
||||||
|
"webpack-cli": "^4.3.1"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"babel-cli": "^6.26.0",
|
||||||
|
"babel-core": "^6.26.3",
|
||||||
|
"babel-loader": "^7.1.4",
|
||||||
|
"babel-preset-env": "^1.7.0",
|
||||||
|
"babel-preset-react": "^6.24.1",
|
||||||
|
"babel-preset-stage-1": "^6.24.1",
|
||||||
|
"preact": "^10.5.10",
|
||||||
|
"webpack": "^4.26.1",
|
||||||
|
"webpack-cli": "^3.0.8"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
const webpack = require('webpack');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
entry: { 'main': [ './main.js' ] },
|
||||||
|
context: __dirname,
|
||||||
|
output: {
|
||||||
|
path: __dirname,
|
||||||
|
filename: './dist/[name].js'
|
||||||
|
},
|
||||||
|
devtool: 'cheap-module-source-map',
|
||||||
|
module: {
|
||||||
|
rules: [
|
||||||
|
{
|
||||||
|
test: /.jsx?$/,
|
||||||
|
loader: 'babel-loader',
|
||||||
|
exclude: /node_modules/
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
plugins: [
|
||||||
|
new webpack.DefinePlugin({
|
||||||
|
"process.env": {
|
||||||
|
NODE_ENV: JSON.stringify(process.env.NODE_ENV || "production")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
]
|
||||||
|
};
|
Loading…
Reference in New Issue