Wrap cluster AFR calculator in a webpage
commit
07fbb7e98c
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"presets": [ [ "env" ], "stage-1" ],
|
||||
"retainLines": true,
|
||||
"plugins": [
|
||||
[ "transform-react-jsx", { "pragma": "preact.h" } ]
|
||||
]
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
// Functions to calculate Annualized Failure Rate of your cluster
|
||||
// if you know AFR of your drives, number of drives, expected rebalance time
|
||||
// and replication factor
|
||||
// License: VNPL-1.0 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
|
||||
// Author: Vitaliy Filippov, 2020+
|
||||
|
||||
module.exports = {
|
||||
cluster_afr_fullmesh,
|
||||
failure_rate_fullmesh,
|
||||
cluster_afr,
|
||||
print_cluster_afr,
|
||||
c_n_k,
|
||||
};
|
||||
|
||||
/******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
|
||||
|
||||
// Estimate the annualized failure rate of a whole cluster in the "full mesh" model.
// n   - number of drives
// afr - annualized failure rate of a single drive
// l   - expected rebalance time in days after a single drive failure
// k   - replication factor / number of drives that must fail at the same time for the cluster to fail
function cluster_afr_fullmesh(n, afr, l, k)
{
    // (k-1) further failures are needed after the first one; n-(k-1) is used both as
    // the drive count passed to failure_rate_fullmesh() and as the final exponent
    const extra_failures = k-1;
    const drive_count = n-extra_failures;
    // Per-drive failure probability scaled from one year down to <l> days
    const afr_in_window = afr*l/365;
    const p_loss_per_drive = afr * failure_rate_fullmesh(drive_count, afr_in_window, extra_failures);
    return 1 - (1 - p_loss_per_drive) ** drive_count;
}
|
||||
|
||||
// Probability of at least <f> failures in a cluster with <n> drives with AFR=<a>.
// Computed as 1 minus the binomial probabilities of exactly 0, 1, ..., f-1 failures.
// NOTE(review): for f <= 0 the function returns (1-a)**n, i.e. the probability of
// no failures at all, rather than 1 - confirm that callers expect this.
function failure_rate_fullmesh(n, a, f)
{
    if (f <= 0)
    {
        return (1-a)**n;
    }
    let remaining = 1;
    let failed = 0;
    while (failed < f)
    {
        // P(exactly <failed> drives out of n fail)
        remaining -= c_n_k(n, failed) * (1-a)**(n-failed) * a**failed;
        failed++;
    }
    return remaining;
}
|
||||
|
||||
/******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
|
||||
|
||||
// <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
|
||||
// <k> replicas, <pgs> unique peer PGs per OSD
|
||||
//
|
||||
// For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
|
||||
// More peers per OSD increase rebalance speed (more drives work together to resilver) if you
|
||||
// let them finish rebalance BEFORE replacing the failed drive.
|
||||
// At the same time, more peers per OSD increase probability of any of them to fail!
|
||||
//
|
||||
// Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
|
||||
// So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
|
||||
// is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
|
||||
// because it gets increased linearly by increased number of peers to fail
|
||||
// and decreased linearly by reduced rebalance time.
|
||||
// Cluster AFR for replicated pools, taking only drive failures into account.
// Accepts one config object with n_hosts, n_drives, afr_drive, capacity, speed,
// replicas, pgs (defaults to 1) and degraded_replacement.
function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    // The number of peers is capped at (n_hosts-1)*n_drives/(replicas-1)
    const peers = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
    // Rebalance time as a fraction of a year; the work is only shared between
    // peers when degraded_replacement is falsy
    const sharing = degraded_replacement ? 1 : peers;
    const l = capacity/sharing/speed/86400/365;
    // All (replicas-1) other drives of one group fail during the rebalance window
    const p_group = (afr_drive*l)**(replicas-1);
    // ...and that happens for at least one of the <peers> groups
    const p_any_group = 1-(1-p_group)**peers;
    return 1 - (1 - afr_drive * p_any_group) ** (n_hosts*n_drives);
}
|
||||
|
||||
// Cluster AFR for erasure-coded pools, taking only drive failures into account.
// Accepts one config object with n_hosts, n_drives, afr_drive, capacity, speed,
// ec = [ data chunk count, parity chunk count ], pgs (defaults to 1) and degraded_replacement.
// The two ec values are combined with "+", so they have to be numbers, not strings.
function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    const ec_total = ec_data+ec_parity;
    // The number of peers is capped at (n_hosts-1)*n_drives/(ec_total-1)
    const peers = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
    // Rebalance time as a fraction of a year; the work is only shared between
    // peers when degraded_replacement is falsy
    const sharing = degraded_replacement ? 1 : peers;
    const l = capacity/sharing/speed/86400/365;
    // At least <ec_parity> of the other (ec_total-1) drives of a group fail during the rebalance window
    const p_group = failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity);
    const p_any_group = 1-(1-p_group)**peers;
    return 1 - (1 - afr_drive * p_any_group) ** (n_hosts*n_drives);
}
|
||||
|
||||
// Cluster AFR for replicated pools, taking both drive and server (host) failures into account.
// Accepts one config object with n_hosts, n_drives, afr_drive, afr_host, capacity, speed,
// replicas, pgs (defaults to 1) and degraded_replacement.
function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    // Upper bound shared by the per-drive and per-host peer counts
    const peer_limit = (n_hosts-1)*n_drives/(replicas-1);
    // Computed from the requested <pgs>, i.e. before it is capped below
    const otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
    const peers = Math.min(pgs, peer_limit);
    const host_peers = Math.min(peers*n_drives, peer_limit);
    // Rebalance time (fraction of a year) after one drive is lost...
    const ld = capacity/(degraded_replacement ? 1 : peers)/speed/86400/365;
    // ...and after n_drives drives' worth of data is lost at once
    const lh = n_drives*capacity/peers/speed/86400/365;
    // Failure rate of one peer: its own AFR plus a share of the AFR of its host
    const peer_rate = afr_drive+afr_host*peers/otherhosts;
    const p1 = (peer_rate*lh);
    const p2 = (peer_rate*ld);
    // Probabilities that no host failure / no drive failure leads to data loss
    const hosts_survive = (1 - afr_host * (1-(1-p1**(replicas-1))**host_peers)) ** n_hosts;
    const drives_survive = (1 - afr_drive * (1-(1-p2**(replicas-1))**peers)) ** (n_hosts*n_drives);
    return 1 - hosts_survive * drives_survive;
}
|
||||
|
||||
// Cluster AFR for erasure-coded pools, taking both drive and server (host) failures into account.
// Accepts one config object with n_hosts, n_drives, afr_drive, afr_host, capacity, speed,
// ec = [ data chunk count, parity chunk count ], pgs (defaults to 1) and degraded_replacement.
// The two ec values are combined with "+", so they have to be numbers, not strings.
function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    const ec_total = ec_data+ec_parity;
    // Upper bound shared by the per-drive and per-host peer counts
    const peer_limit = (n_hosts-1)*n_drives/(ec_total-1);
    // Computed from the requested <pgs>, i.e. before it is capped below
    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
    const peers = Math.min(pgs, peer_limit);
    const host_peers = Math.min(peers*n_drives, peer_limit);
    // Rebalance time (fraction of a year) after one drive is lost...
    const ld = capacity/(degraded_replacement ? 1 : peers)/speed/86400/365;
    // ...and after n_drives drives' worth of data is lost at once
    const lh = n_drives*capacity/peers/speed/86400/365;
    // Failure rate of one peer: its own AFR plus a share of the AFR of its host
    const peer_rate = afr_drive+afr_host*peers/otherhosts;
    const p1 = (peer_rate*lh);
    const p2 = (peer_rate*ld);
    // Probabilities that no host failure / no drive failure leads to data loss
    const hosts_survive = (1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**host_peers)) ** n_hosts;
    const drives_survive = (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**peers)) ** (n_hosts*n_drives);
    return 1 - hosts_survive * drives_survive;
}
|
||||
|
||||
// Entry point that dispatches to one of the four PG-based estimators:
// a truthy config.ec selects the erasure-coded variants, and
// a truthy config.afr_host selects the variants that also model host failures.
function cluster_afr(config)
{
    let estimate;
    if (config.ec)
    {
        estimate = config.afr_host ? cluster_afr_pgs_ec_hosts : cluster_afr_pgs_ec;
    }
    else
    {
        estimate = config.afr_host ? cluster_afr_pgs_hosts : cluster_afr_pgs;
    }
    return estimate(config);
}
|
||||
|
||||
// Print a human-readable description of <config> followed by the cluster AFR
// (as a percentage with 7 decimal places) to the console.
// Numbers are formatted with Number.prototype.toFixed() because no sprintf()
// implementation is defined or imported in this file.
function print_cluster_afr(config)
{
    const fixed1 = (value) => Number(value).toFixed(1);
    console.log(
        `${config.n_hosts} nodes with ${config.n_drives} ${fixed1(config.capacity/1000)}TB drives`+
        `, capable to backfill at ${fixed1(config.speed*1000)} MB/s, drive AFR ${fixed1(config.afr_drive*100)}%`+
        (config.afr_host ? `, host AFR ${fixed1(config.afr_host*100)}%` : '')+
        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
        `, ${config.pgs||1} PG per OSD`+
        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
    );
    console.log('-> '+(100*cluster_afr(config)).toFixed(7)+'%\n');
}
|
||||
|
||||
/******** UTILITY ********/
|
||||
|
||||
// Combination count: the number of ways to choose <k> items out of <n>
// (binomial coefficient). Returns 1 when k <= 0.
function c_n_k(n, k)
{
    let r = 1;
    for (let i = 0; i < k; i++)
    {
        // Multiply before dividing: for integer n every intermediate value is itself
        // a binomial coefficient, so the division is exact and no rounding error
        // accumulates (r *= (n-i)/(i+1) would round the quotient first)
        r = r * (n-i) / (i+1);
    }
    return r;
}
|
|
@ -0,0 +1,25 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Калькулятор вероятности отказа кластера Ceph/Vitastor</title>
|
||||
<meta charset="utf-8" />
|
||||
<style>
|
||||
* { box-sizing: border-box; }
|
||||
body { margin: 0; font-size: 15px; font-family: Arial, Helvetica, sans-serif; }
|
||||
input { font-size: inherit; font-family: inherit; vertical-align: middle; }
|
||||
table { border-collapse: collapse; margin-left: 110px; }
|
||||
td { padding: 5px; }
|
||||
th { text-align: left; font-weight: normal; white-space: nowrap; width: 1%; }
|
||||
input[type="text"] { border: 1px solid #aaa; padding: 4px; border-radius: 3px; }
|
||||
.switch { float: left; border: 1px solid #aaa; color: #666; height: 30px; padding: 5px 10px; transition: all 200ms ease-in-out; cursor: pointer; position: relative; overflow: hidden; }
|
||||
.switch input { visibility: hidden; position: absolute; top: 0; left: 0; right: 0; bottom: 0; }
|
||||
.switch.l { border-right-width: 0; border-radius: 20px 0 0 20px; padding-left: 15px; }
|
||||
.switch.r { border-left-width: 0; border-radius: 0 20px 20px 0; padding-right: 15px; }
|
||||
.switch.sel { border-color: #08f; background: #08f; color: white; }
|
||||
.switch:hover { border-color: #4af; background: #4af; color: white; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
<script type="text/javascript" src="dist/main.js"></script>
|
||||
</html>
|
|
@ -0,0 +1,151 @@
|
|||
import * as preact from 'preact';
|
||||
/** @jsx preact.h */
|
||||
import { cluster_afr } from './afr.js';
|
||||
|
||||
class Calc extends preact.Component
|
||||
{
|
||||
state = {
|
||||
hosts: 10,
|
||||
drives: 10,
|
||||
afr_drive: 3,
|
||||
afr_host: 5,
|
||||
capacity: 8,
|
||||
speed: 20,
|
||||
ec: false,
|
||||
replicas: 2,
|
||||
ec_data: 2,
|
||||
ec_parity: 1,
|
||||
eager: false,
|
||||
result: 0,
|
||||
}
|
||||
|
||||
calc(st)
|
||||
{
|
||||
st = { ...this.state, ...st };
|
||||
st.result = 100*cluster_afr({
|
||||
n_hosts: st.hosts,
|
||||
n_drives: st.drives,
|
||||
afr_drive: st.afr_drive/100,
|
||||
afr_host: st.afr_host/100,
|
||||
capacity: st.capacity*1000,
|
||||
speed: st.speed/1000,
|
||||
ec: st.ec ? [ st.ec_data, st.ec_parity ] : null,
|
||||
replicas: st.replicas,
|
||||
pgs: 100,
|
||||
degraded_replacement: st.eager,
|
||||
});
|
||||
this.setState(st);
|
||||
}
|
||||
|
||||
setter(field)
|
||||
{
|
||||
if (!this.setter[field])
|
||||
{
|
||||
this.setter[field] = (event) =>
|
||||
{
|
||||
this.calc({ [field]: event.target.value });
|
||||
};
|
||||
}
|
||||
return this.setter[field];
|
||||
}
|
||||
|
||||
setRepl = () =>
|
||||
{
|
||||
this.calc({ ec: false });
|
||||
}
|
||||
|
||||
setEC = () =>
|
||||
{
|
||||
this.calc({ ec: true });
|
||||
}
|
||||
|
||||
setEager = (event) =>
|
||||
{
|
||||
this.calc({ eager: event.target.checked });
|
||||
}
|
||||
|
||||
componentDidMount()
|
||||
{
|
||||
this.calc({});
|
||||
}
|
||||
|
||||
render(props, state)
|
||||
{
|
||||
return (<div style="width: 750px; margin: 20px; padding: 20px; box-shadow: 0 19px 60px rgba(0, 0, 0, 0.3), 0 15px 20px rgba(0, 0, 0, 0.22);">
|
||||
<h2 style="text-align: center; font-size: 150%; margin: 10px 0 20px 0; font-weight: bold">
|
||||
Калькулятор вероятности отказа кластера Ceph/Vitastor
|
||||
</h2>
|
||||
<p>
|
||||
Вероятность полного отказа кластера зависит от числа серверов и дисков
|
||||
(чем их больше, тем вероятность больше), от схемы избыточности, скорости ребаланса (восстановления),
|
||||
и, конечно, непосредственно вероятности выхода из строя самих дисков и серверов.
|
||||
</p>
|
||||
<p>
|
||||
Расчёт ведётся в простом предположении, что отказы распределены равномерно во времени.
|
||||
</p>
|
||||
<table>
|
||||
<tr>
|
||||
<th>Число серверов</th>
|
||||
<td><input type="text" value={state.hosts} onchange={this.setter('hosts')} /></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Число дисков в сервере</th>
|
||||
<td><input type="text" value={state.drives} onchange={this.setter('drives')} /></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Ёмкость дисков</th>
|
||||
<td><input type="text" value={state.capacity} onchange={this.setter('capacity')} /> ТБ</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Схема избыточности</th>
|
||||
<td>
|
||||
<label class={"switch l"+(state.ec ? "" : " sel")}>
|
||||
<input type="radio" name="scheme" checked={!state.ec} onclick={this.setRepl} /> Репликация
|
||||
</label>
|
||||
<label class={"switch r"+(state.ec ? " sel" : "")}>
|
||||
<input type="radio" name="scheme" checked={state.ec} onclick={this.setEC} /> EC (коды коррекции ошибок)
|
||||
</label>
|
||||
</td>
|
||||
</tr>
|
||||
{state.ec ? null : <tr>
|
||||
<th>Число реплик</th>
|
||||
<td><input type="text" value={state.replicas} onchange={this.setter('replicas')} /></td>
|
||||
</tr>}
|
||||
{state.ec ? <tr>
|
||||
<th>Число дисков данных</th>
|
||||
<td><input type="text" value={state.ec_data} onchange={this.setter('ec_data')} /></td>
|
||||
</tr> : null}
|
||||
{state.ec ? <tr>
|
||||
<th>Число дисков чётности</th>
|
||||
<td><input type="text" value={state.ec_parity} onchange={this.setter('ec_parity')} /></td>
|
||||
</tr> : null}
|
||||
<tr>
|
||||
<th>Оценочная скорость<br />восстановления на 1 OSD</th>
|
||||
<td><input type="text" value={state.speed} onchange={this.setter('speed')} /> МБ/с</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><abbr title="Annualized Failure Rate, вероятность отказа в течение года в %">AFR</abbr> диска</th>
|
||||
<td><input type="text" value={state.afr_drive} onchange={this.setter('afr_drive')} /> %</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>AFR сервера</th>
|
||||
<td><input type="text" value={state.afr_host} onchange={this.setter('afr_host')} /> %</td>
|
||||
</tr>
|
||||
</table>
|
||||
<p>
|
||||
<label><input type="checkbox" checked={state.eager} onchange={this.setEager} />
|
||||
Я нетерпеливый и заменяю отказавший диск сразу, не давая данным уехать на остальные диски
|
||||
(либо данным уезжать некуда, например, сервера всего 3 при 3 репликах)
|
||||
</label>
|
||||
</p>
|
||||
<div style="text-align: center; font-size: 150%; margin: 20px 0; font-weight: bold">
|
||||
Вероятность потери данных в течение года:
|
||||
</div>
|
||||
<div style="text-align: center; font-size: 200%; margin: 10px 0; font-weight: bold">
|
||||
{Math.round(state.result*10000)/10000} %
|
||||
</div>
|
||||
</div>);
|
||||
}
|
||||
}
|
||||
|
||||
preact.render(<Calc />, document.body);
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"name": "ceph-afr-calc",
|
||||
"version": "1.0.0",
|
||||
"description": "Ceph/Vitastor cluster failure calculator",
|
||||
"main": "main.js",
|
||||
"scripts": {
|
||||
"build": "webpack",
|
||||
"watch-dev": "NODE_ENV=development webpack --mode=development -w"
|
||||
},
|
||||
"author": "Vitaliy Filippov",
|
||||
"license": "AGPL-3.0",
|
||||
"devDependencies": {
|
||||
"webpack-cli": "^4.3.1"
|
||||
},
|
||||
"dependencies": {
|
||||
"babel-cli": "^6.26.0",
|
||||
"babel-core": "^6.26.3",
|
||||
"babel-loader": "^7.1.4",
|
||||
"babel-preset-env": "^1.7.0",
|
||||
"babel-preset-react": "^6.24.1",
|
||||
"babel-preset-stage-1": "^6.24.1",
|
||||
"preact": "^10.5.10",
|
||||
"webpack": "^4.26.1",
|
||||
"webpack-cli": "^3.0.8"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
const webpack = require('webpack');
|
||||
const path = require('path');
|
||||
|
||||
module.exports = {
|
||||
entry: { 'main': [ './main.js' ] },
|
||||
context: __dirname,
|
||||
output: {
|
||||
path: __dirname,
|
||||
filename: './dist/[name].js'
|
||||
},
|
||||
devtool: 'cheap-module-source-map',
|
||||
module: {
|
||||
rules: [
|
||||
{
|
||||
test: /.jsx?$/,
|
||||
loader: 'babel-loader',
|
||||
exclude: /node_modules/
|
||||
}
|
||||
]
|
||||
},
|
||||
plugins: [
|
||||
new webpack.DefinePlugin({
|
||||
"process.env": {
|
||||
NODE_ENV: JSON.stringify(process.env.NODE_ENV || "production")
|
||||
}
|
||||
})
|
||||
]
|
||||
};
|
Loading…
Reference in New Issue