Compare commits
No commits in common. "cf60b6818c31a891b32ffd2c60972fe04b7b3e32" and "50e56b3b92122424bc57f53a9a2b23bf15147f74" have entirely different histories.
cf60b6818c
...
50e56b3b92
|
@ -1,3 +1,2 @@
|
||||||
mon usr/lib/vitastor/mon
|
mon usr/lib/vitastor
|
||||||
mon/scripts/make-etcd usr/lib/vitastor/mon
|
mon/vitastor-mon.service /lib/systemd/system
|
||||||
mon/scripts/vitastor-mon.service /lib/systemd/system
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
usr/bin/vitastor-osd
|
usr/bin/vitastor-osd
|
||||||
usr/bin/vitastor-disk
|
usr/bin/vitastor-disk
|
||||||
usr/bin/vitastor-dump-journal
|
usr/bin/vitastor-dump-journal
|
||||||
mon/scripts/vitastor-osd@.service /lib/systemd/system
|
mon/vitastor-osd@.service /lib/systemd/system
|
||||||
mon/scripts/vitastor.target /lib/systemd/system
|
mon/vitastor.target /lib/systemd/system
|
||||||
mon/scripts/90-vitastor.rules /lib/udev/rules.d
|
mon/90-vitastor.rules /lib/udev/rules.d
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
|
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
|
||||||
[here](../config/layout-cluster.en.md#immediate_commit).
|
[here](../config/layout-cluster.en.md#immediate_commit).
|
||||||
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
|
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
|
||||||
Toshiba MG, Seagate EXOS or something similar. If your drives don't have such cache then
|
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
|
||||||
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
|
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
|
||||||
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
|
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
|
||||||
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
|
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
|
||||||
|
@ -33,7 +33,7 @@
|
||||||
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
|
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
|
||||||
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
|
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
|
||||||
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
|
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
|
||||||
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
|
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
|
||||||
|
|
||||||
## Configure monitors
|
## Configure monitors
|
||||||
|
|
||||||
|
|
|
@ -123,4 +123,4 @@ vitastor-cli create -s 10G testimg
|
||||||
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
|
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
|
||||||
а также кластерную файловую систему, то:
|
а также кластерную файловую систему, то:
|
||||||
|
|
||||||
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)
|
- [Следуйте инструкциям](../usage/nfs.en.md#vitastorfs)
|
||||||
|
|
|
@ -11,7 +11,6 @@ module.exports = {
|
||||||
"ecmaVersion": 2020
|
"ecmaVersion": 2020
|
||||||
},
|
},
|
||||||
"plugins": [
|
"plugins": [
|
||||||
"import"
|
|
||||||
],
|
],
|
||||||
"rules": {
|
"rules": {
|
||||||
"indent": [
|
"indent": [
|
||||||
|
@ -45,10 +44,6 @@ module.exports = {
|
||||||
],
|
],
|
||||||
"node/shebang": [
|
"node/shebang": [
|
||||||
"off"
|
"off"
|
||||||
],
|
|
||||||
"import/no-unresolved": [
|
|
||||||
2,
|
|
||||||
{ "commonjs": true }
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,320 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
const http = require('http');
|
|
||||||
const WebSocket = require('ws');
|
|
||||||
|
|
||||||
class EtcdAdapter
|
|
||||||
{
|
|
||||||
constructor(mon)
|
|
||||||
{
|
|
||||||
this.mon = mon;
|
|
||||||
this.ws = null;
|
|
||||||
this.ws_alive = false;
|
|
||||||
this.ws_keepalive_timer = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
parse_config(config)
|
|
||||||
{
|
|
||||||
this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
|
|
||||||
}
|
|
||||||
|
|
||||||
parse_etcd_addresses(addrs)
|
|
||||||
{
|
|
||||||
const is_local_ip = this.mon.local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
|
|
||||||
this.etcd_local = [];
|
|
||||||
this.etcd_urls = [];
|
|
||||||
this.selected_etcd_url = null;
|
|
||||||
this.etcd_urls_to_try = [];
|
|
||||||
if (!(addrs instanceof Array))
|
|
||||||
addrs = addrs ? (''+(addrs||'')).split(/,/) : [];
|
|
||||||
if (!addrs.length)
|
|
||||||
{
|
|
||||||
console.error('Vitastor etcd address(es) not specified. Please set on the command line or in the config file');
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
for (let url of addrs)
|
|
||||||
{
|
|
||||||
let scheme = 'http';
|
|
||||||
url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
|
|
||||||
const slash = url.indexOf('/');
|
|
||||||
const colon = url.indexOf(':');
|
|
||||||
const is_local = is_local_ip[colon >= 0 ? url.substr(0, colon) : (slash >= 0 ? url.substr(0, slash) : url)];
|
|
||||||
url = scheme+'://'+(slash >= 0 ? url : url+'/v3');
|
|
||||||
if (is_local)
|
|
||||||
this.etcd_local.push(url);
|
|
||||||
else
|
|
||||||
this.etcd_urls.push(url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pick_next_etcd()
|
|
||||||
{
|
|
||||||
if (this.selected_etcd_url)
|
|
||||||
return this.selected_etcd_url;
|
|
||||||
if (!this.etcd_urls_to_try || !this.etcd_urls_to_try.length)
|
|
||||||
{
|
|
||||||
this.etcd_urls_to_try = [ ...this.etcd_local ];
|
|
||||||
const others = [ ...this.etcd_urls ];
|
|
||||||
while (others.length)
|
|
||||||
{
|
|
||||||
const url = others.splice(0|(others.length*Math.random()), 1);
|
|
||||||
this.etcd_urls_to_try.push(url[0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this.selected_etcd_url = this.etcd_urls_to_try.shift();
|
|
||||||
return this.selected_etcd_url;
|
|
||||||
}
|
|
||||||
|
|
||||||
restart_watcher(cur_addr)
|
|
||||||
{
|
|
||||||
if (this.ws)
|
|
||||||
{
|
|
||||||
this.ws.close();
|
|
||||||
this.ws = null;
|
|
||||||
}
|
|
||||||
if (this.ws_keepalive_timer)
|
|
||||||
{
|
|
||||||
clearInterval(this.ws_keepalive_timer);
|
|
||||||
this.ws_keepalive_timer = null;
|
|
||||||
}
|
|
||||||
if (this.selected_etcd_url == cur_addr)
|
|
||||||
{
|
|
||||||
this.selected_etcd_url = null;
|
|
||||||
}
|
|
||||||
this.start_watcher(this.mon.config.etcd_mon_retries).catch(this.mon.die);
|
|
||||||
}
|
|
||||||
|
|
||||||
async start_watcher(retries)
|
|
||||||
{
|
|
||||||
let retry = 0;
|
|
||||||
if (!retries || retries < 1)
|
|
||||||
{
|
|
||||||
retries = 1;
|
|
||||||
}
|
|
||||||
const tried = {};
|
|
||||||
while (retries < 0 || retry < retries)
|
|
||||||
{
|
|
||||||
const cur_addr = this.pick_next_etcd();
|
|
||||||
const base = 'ws'+cur_addr.substr(4);
|
|
||||||
let now = Date.now();
|
|
||||||
if (tried[base] && now-tried[base] < this.mon.etcd_start_timeout)
|
|
||||||
{
|
|
||||||
await new Promise(ok => setTimeout(ok, this.mon.etcd_start_timeout-(now-tried[base])));
|
|
||||||
now = Date.now();
|
|
||||||
}
|
|
||||||
tried[base] = now;
|
|
||||||
const ok = await new Promise(ok =>
|
|
||||||
{
|
|
||||||
const timer_id = setTimeout(() =>
|
|
||||||
{
|
|
||||||
this.ws.close();
|
|
||||||
this.ws = null;
|
|
||||||
ok(false);
|
|
||||||
}, this.mon.config.etcd_mon_timeout);
|
|
||||||
this.ws = new WebSocket(base+'/watch');
|
|
||||||
const fail = () =>
|
|
||||||
{
|
|
||||||
ok(false);
|
|
||||||
};
|
|
||||||
this.ws.on('error', fail);
|
|
||||||
this.ws.on('open', () =>
|
|
||||||
{
|
|
||||||
this.ws.removeListener('error', fail);
|
|
||||||
if (timer_id)
|
|
||||||
clearTimeout(timer_id);
|
|
||||||
ok(true);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
if (ok)
|
|
||||||
break;
|
|
||||||
if (this.selected_etcd_url == cur_addr)
|
|
||||||
this.selected_etcd_url = null;
|
|
||||||
this.ws = null;
|
|
||||||
retry++;
|
|
||||||
}
|
|
||||||
if (!this.ws)
|
|
||||||
{
|
|
||||||
this.mon.failconnect('Failed to open etcd watch websocket');
|
|
||||||
}
|
|
||||||
const cur_addr = this.selected_etcd_url;
|
|
||||||
this.ws_alive = true;
|
|
||||||
this.ws_keepalive_timer = setInterval(() =>
|
|
||||||
{
|
|
||||||
if (this.ws_alive)
|
|
||||||
{
|
|
||||||
this.ws_alive = false;
|
|
||||||
this.ws.send(JSON.stringify({ progress_request: {} }));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
console.log('etcd websocket timed out, restarting it');
|
|
||||||
this.restart_watcher(cur_addr);
|
|
||||||
}
|
|
||||||
}, (Number(this.mon.config.etcd_ws_keepalive_interval) || 30)*1000);
|
|
||||||
this.ws.on('error', () => this.restart_watcher(cur_addr));
|
|
||||||
this.ws.send(JSON.stringify({
|
|
||||||
create_request: {
|
|
||||||
key: b64(this.mon.etcd_prefix+'/'),
|
|
||||||
range_end: b64(this.mon.etcd_prefix+'0'),
|
|
||||||
start_revision: ''+this.mon.etcd_watch_revision,
|
|
||||||
watch_id: 1,
|
|
||||||
progress_notify: true,
|
|
||||||
},
|
|
||||||
}));
|
|
||||||
this.ws.on('message', (msg) =>
|
|
||||||
{
|
|
||||||
this.ws_alive = true;
|
|
||||||
let data;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
data = JSON.parse(msg);
|
|
||||||
}
|
|
||||||
catch (e)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
if (!data || !data.result)
|
|
||||||
{
|
|
||||||
console.error('Unknown message received from watch websocket: '+msg);
|
|
||||||
}
|
|
||||||
else if (data.result.canceled)
|
|
||||||
{
|
|
||||||
// etcd watch canceled
|
|
||||||
if (data.result.compact_revision)
|
|
||||||
{
|
|
||||||
// we may miss events if we proceed
|
|
||||||
console.error('Revisions before '+data.result.compact_revision+' were compacted by etcd, exiting');
|
|
||||||
this.mon.on_stop(1);
|
|
||||||
}
|
|
||||||
console.error('Watch canceled by etcd, reason: '+data.result.cancel_reason+', exiting');
|
|
||||||
this.mon.on_stop(1);
|
|
||||||
}
|
|
||||||
else if (data.result.created)
|
|
||||||
{
|
|
||||||
// etcd watch created
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
this.mon.on_message(data.result);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
async become_master()
|
|
||||||
{
|
|
||||||
const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
|
|
||||||
// eslint-disable-next-line no-constant-condition
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
const res = await this.etcd_call('/kv/txn', {
|
|
||||||
compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.mon.etcd_prefix+'/mon/master') } ],
|
|
||||||
success: [ { requestPut: { key: b64(this.mon.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } } ],
|
|
||||||
}, this.mon.etcd_start_timeout, 0);
|
|
||||||
if (res.succeeded)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
console.log('Waiting to become master');
|
|
||||||
await new Promise(ok => setTimeout(ok, this.mon.etcd_start_timeout));
|
|
||||||
}
|
|
||||||
console.log('Became master');
|
|
||||||
}
|
|
||||||
|
|
||||||
async etcd_call(path, body, timeout, retries)
|
|
||||||
{
|
|
||||||
let retry = 0;
|
|
||||||
if (retries >= 0 && retries < 1)
|
|
||||||
{
|
|
||||||
retries = 1;
|
|
||||||
}
|
|
||||||
const tried = {};
|
|
||||||
while (retries < 0 || retry < retries)
|
|
||||||
{
|
|
||||||
retry++;
|
|
||||||
const base = this.pick_next_etcd();
|
|
||||||
let now = Date.now();
|
|
||||||
if (tried[base] && now-tried[base] < timeout)
|
|
||||||
{
|
|
||||||
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
|
|
||||||
now = Date.now();
|
|
||||||
}
|
|
||||||
tried[base] = now;
|
|
||||||
const res = await POST(base+path, body, timeout);
|
|
||||||
if (res.error)
|
|
||||||
{
|
|
||||||
if (this.selected_etcd_url == base)
|
|
||||||
this.selected_etcd_url = null;
|
|
||||||
console.error('failed to query etcd: '+res.error);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (res.json)
|
|
||||||
{
|
|
||||||
if (res.json.error)
|
|
||||||
{
|
|
||||||
console.error('etcd returned error: '+res.json.error);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return res.json;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this.mon.failconnect();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function POST(url, body, timeout)
|
|
||||||
{
|
|
||||||
return new Promise(ok =>
|
|
||||||
{
|
|
||||||
const body_text = Buffer.from(JSON.stringify(body));
|
|
||||||
let timer_id = timeout > 0 ? setTimeout(() =>
|
|
||||||
{
|
|
||||||
if (req)
|
|
||||||
req.abort();
|
|
||||||
req = null;
|
|
||||||
ok({ error: 'timeout' });
|
|
||||||
}, timeout) : null;
|
|
||||||
let req = http.request(url, { method: 'POST', headers: {
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
'Content-Length': body_text.length,
|
|
||||||
} }, (res) =>
|
|
||||||
{
|
|
||||||
if (!req)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
clearTimeout(timer_id);
|
|
||||||
let res_body = '';
|
|
||||||
res.setEncoding('utf8');
|
|
||||||
res.on('error', (error) => ok({ error }));
|
|
||||||
res.on('data', chunk => { res_body += chunk; });
|
|
||||||
res.on('end', () =>
|
|
||||||
{
|
|
||||||
if (res.statusCode != 200)
|
|
||||||
{
|
|
||||||
ok({ error: res_body, code: res.statusCode });
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
try
|
|
||||||
{
|
|
||||||
res_body = JSON.parse(res_body);
|
|
||||||
ok({ response: res, json: res_body });
|
|
||||||
}
|
|
||||||
catch (e)
|
|
||||||
{
|
|
||||||
ok({ error: e, response: res, body: res_body });
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
req.on('error', (error) => ok({ error }));
|
|
||||||
req.on('close', () => ok({ error: new Error('Connection closed prematurely') }));
|
|
||||||
req.write(body_text);
|
|
||||||
req.end();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
function b64(str)
|
|
||||||
{
|
|
||||||
return Buffer.from(str).toString('base64');
|
|
||||||
}
|
|
||||||
|
|
||||||
module.exports = EtcdAdapter;
|
|
|
@ -1,391 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
|
|
||||||
const etcd_nonempty_keys = {
|
|
||||||
'config/global': 1,
|
|
||||||
'config/node_placement': 1,
|
|
||||||
'config/pools': 1,
|
|
||||||
'config/pgs': 1,
|
|
||||||
'history/last_clean_pgs': 1,
|
|
||||||
'stats': 1,
|
|
||||||
};
|
|
||||||
const etcd_allow = new RegExp('^'+[
|
|
||||||
'config/global',
|
|
||||||
'config/node_placement',
|
|
||||||
'config/pools',
|
|
||||||
'config/osd/[1-9]\\d*',
|
|
||||||
'config/pgs',
|
|
||||||
'config/inode/[1-9]\\d*/[1-9]\\d*',
|
|
||||||
'osd/state/[1-9]\\d*',
|
|
||||||
'osd/stats/[1-9]\\d*',
|
|
||||||
'osd/inodestats/[1-9]\\d*',
|
|
||||||
'osd/space/[1-9]\\d*',
|
|
||||||
'mon/master',
|
|
||||||
'mon/member/[a-f0-9]+',
|
|
||||||
'pg/state/[1-9]\\d*/[1-9]\\d*',
|
|
||||||
'pg/stats/[1-9]\\d*/[1-9]\\d*',
|
|
||||||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
|
||||||
'history/last_clean_pgs',
|
|
||||||
'inode/stats/[1-9]\\d*/\\d+',
|
|
||||||
'pool/stats/[1-9]\\d*',
|
|
||||||
'stats',
|
|
||||||
'index/image/.*',
|
|
||||||
'index/maxid/[1-9]\\d*',
|
|
||||||
].join('$|^')+'$');
|
|
||||||
|
|
||||||
const etcd_tree = {
|
|
||||||
config: {
|
|
||||||
/* global: {
|
|
||||||
// WARNING: NOT ALL OF THESE ARE ACTUALLY CONFIGURABLE HERE
|
|
||||||
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
|
||||||
// etcd connection
|
|
||||||
config_path: "/etc/vitastor/vitastor.conf",
|
|
||||||
etcd_prefix: "/vitastor",
|
|
||||||
// etcd connection - configurable online
|
|
||||||
etcd_address: "10.0.115.10:2379/v3",
|
|
||||||
// mon
|
|
||||||
etcd_mon_ttl: 5, // min: 1
|
|
||||||
etcd_mon_timeout: 1000, // ms. min: 0
|
|
||||||
etcd_mon_retries: 5, // min: 0
|
|
||||||
mon_change_timeout: 1000, // ms. min: 100
|
|
||||||
mon_retry_change_timeout: 50, // ms. min: 10
|
|
||||||
mon_stats_timeout: 1000, // ms. min: 100
|
|
||||||
osd_out_time: 600, // seconds. min: 0
|
|
||||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
|
||||||
use_old_pg_combinator: false,
|
|
||||||
// client and osd
|
|
||||||
tcp_header_buffer_size: 65536,
|
|
||||||
use_sync_send_recv: false,
|
|
||||||
use_rdma: true,
|
|
||||||
rdma_device: null, // for example, "rocep5s0f0"
|
|
||||||
rdma_port_num: 1,
|
|
||||||
rdma_gid_index: 0,
|
|
||||||
rdma_mtu: 4096,
|
|
||||||
rdma_max_sge: 128,
|
|
||||||
rdma_max_send: 8,
|
|
||||||
rdma_max_recv: 16,
|
|
||||||
rdma_max_msg: 132096,
|
|
||||||
block_size: 131072,
|
|
||||||
disk_alignment: 4096,
|
|
||||||
bitmap_granularity: 4096,
|
|
||||||
immediate_commit: false, // 'all' or 'small'
|
|
||||||
// client - configurable online
|
|
||||||
client_max_dirty_bytes: 33554432,
|
|
||||||
client_max_dirty_ops: 1024,
|
|
||||||
client_enable_writeback: false,
|
|
||||||
client_max_buffered_bytes: 33554432,
|
|
||||||
client_max_buffered_ops: 1024,
|
|
||||||
client_max_writeback_iodepth: 256,
|
|
||||||
client_retry_interval: 50, // ms. min: 10
|
|
||||||
client_eio_retry_interval: 1000, // ms
|
|
||||||
client_retry_enospc: true,
|
|
||||||
osd_nearfull_ratio: 0.95,
|
|
||||||
// client and osd - configurable online
|
|
||||||
log_level: 0,
|
|
||||||
peer_connect_interval: 5, // seconds. min: 1
|
|
||||||
peer_connect_timeout: 5, // seconds. min: 1
|
|
||||||
osd_idle_timeout: 5, // seconds. min: 1
|
|
||||||
osd_ping_timeout: 5, // seconds. min: 1
|
|
||||||
max_etcd_attempts: 5,
|
|
||||||
etcd_quick_timeout: 1000, // ms
|
|
||||||
etcd_slow_timeout: 5000, // ms
|
|
||||||
etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
|
|
||||||
etcd_ws_keepalive_interval: 30, // seconds
|
|
||||||
// osd
|
|
||||||
etcd_report_interval: 5, // seconds
|
|
||||||
etcd_stats_interval: 30, // seconds
|
|
||||||
run_primary: true,
|
|
||||||
osd_network: null, // "192.168.7.0/24" or an array of masks
|
|
||||||
bind_address: "0.0.0.0",
|
|
||||||
bind_port: 0,
|
|
||||||
readonly: false,
|
|
||||||
osd_memlock: false,
|
|
||||||
// osd - configurable online
|
|
||||||
autosync_interval: 5,
|
|
||||||
autosync_writes: 128,
|
|
||||||
client_queue_depth: 128, // unused
|
|
||||||
recovery_queue_depth: 1,
|
|
||||||
recovery_sleep_us: 0,
|
|
||||||
recovery_tune_util_low: 0.1,
|
|
||||||
recovery_tune_client_util_low: 0,
|
|
||||||
recovery_tune_util_high: 1.0,
|
|
||||||
recovery_tune_client_util_high: 0.5,
|
|
||||||
recovery_tune_interval: 1,
|
|
||||||
recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
|
|
||||||
recovery_tune_sleep_min_us: 10, // 10 microseconds
|
|
||||||
recovery_pg_switch: 128,
|
|
||||||
recovery_sync_batch: 16,
|
|
||||||
no_recovery: false,
|
|
||||||
no_rebalance: false,
|
|
||||||
print_stats_interval: 3,
|
|
||||||
slow_log_interval: 10,
|
|
||||||
inode_vanish_time: 60,
|
|
||||||
auto_scrub: false,
|
|
||||||
no_scrub: false,
|
|
||||||
scrub_interval: '30d', // 1s/1m/1h/1d
|
|
||||||
scrub_queue_depth: 1,
|
|
||||||
scrub_sleep: 0, // milliseconds
|
|
||||||
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
|
||||||
scrub_find_best: true,
|
|
||||||
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
|
|
||||||
// blockstore - fixed in superblock
|
|
||||||
block_size,
|
|
||||||
disk_alignment,
|
|
||||||
journal_block_size,
|
|
||||||
meta_block_size,
|
|
||||||
bitmap_granularity,
|
|
||||||
journal_device,
|
|
||||||
journal_offset,
|
|
||||||
journal_size,
|
|
||||||
disable_journal_fsync,
|
|
||||||
data_device,
|
|
||||||
data_offset,
|
|
||||||
data_size,
|
|
||||||
disable_data_fsync,
|
|
||||||
meta_device,
|
|
||||||
meta_offset,
|
|
||||||
disable_meta_fsync,
|
|
||||||
disable_device_lock,
|
|
||||||
// blockstore - configurable offline
|
|
||||||
inmemory_metadata,
|
|
||||||
inmemory_journal,
|
|
||||||
journal_sector_buffer_count,
|
|
||||||
journal_no_same_sector_overwrites,
|
|
||||||
// blockstore - configurable online
|
|
||||||
max_write_iodepth,
|
|
||||||
min_flusher_count: 1,
|
|
||||||
max_flusher_count: 256,
|
|
||||||
throttle_small_writes: false,
|
|
||||||
throttle_target_iops: 100,
|
|
||||||
throttle_target_mbs: 100,
|
|
||||||
throttle_target_parallelism: 1,
|
|
||||||
throttle_threshold_us: 50,
|
|
||||||
}, */
|
|
||||||
global: {},
|
|
||||||
/* node_placement: {
|
|
||||||
host1: { level: 'host', parent: 'rack1' },
|
|
||||||
...
|
|
||||||
}, */
|
|
||||||
node_placement: {},
|
|
||||||
/* pools: {
|
|
||||||
<id>: {
|
|
||||||
name: 'testpool',
|
|
||||||
// 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
|
|
||||||
scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
|
|
||||||
pg_size: 3,
|
|
||||||
pg_minsize: 2,
|
|
||||||
// number of parity chunks, required for EC
|
|
||||||
parity_chunks?: 1,
|
|
||||||
pg_count: 100,
|
|
||||||
// default is failure_domain=host
|
|
||||||
failure_domain?: 'host',
|
|
||||||
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
|
|
||||||
level_placement?: 'dc=112233 host=123456',
|
|
||||||
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
|
|
||||||
old_combinator: false,
|
|
||||||
max_osd_combinations: 10000,
|
|
||||||
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
|
|
||||||
block_size: 131072,
|
|
||||||
bitmap_granularity: 4096,
|
|
||||||
// 'all'/'small'/'none', same as in OSD options
|
|
||||||
immediate_commit: 'none',
|
|
||||||
pg_stripe_size: 0,
|
|
||||||
root_node?: 'rack1',
|
|
||||||
// restrict pool to OSDs having all of these tags
|
|
||||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
|
||||||
// prefer to put primary on OSD with these tags
|
|
||||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
|
||||||
// scrub interval
|
|
||||||
scrub_interval?: '30d',
|
|
||||||
},
|
|
||||||
...
|
|
||||||
}, */
|
|
||||||
pools: {},
|
|
||||||
osd: {
|
|
||||||
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
|
|
||||||
},
|
|
||||||
/* pgs: {
|
|
||||||
hash: string,
|
|
||||||
items: {
|
|
||||||
<pool_id>: {
|
|
||||||
<pg_id>: {
|
|
||||||
osd_set: [ 1, 2, 3 ],
|
|
||||||
primary: 1,
|
|
||||||
pause: false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, */
|
|
||||||
pgs: {},
|
|
||||||
/* inode: {
|
|
||||||
<pool_id>: {
|
|
||||||
<inode_t>: {
|
|
||||||
name: string,
|
|
||||||
size?: uint64_t, // bytes
|
|
||||||
parent_pool?: <pool_id>,
|
|
||||||
parent_id?: <inode_t>,
|
|
||||||
readonly?: boolean,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, */
|
|
||||||
inode: {},
|
|
||||||
},
|
|
||||||
osd: {
|
|
||||||
state: {
|
|
||||||
/* <osd_num_t>: {
|
|
||||||
state: "up",
|
|
||||||
addresses: string[],
|
|
||||||
host: string,
|
|
||||||
port: uint16_t,
|
|
||||||
primary_enabled: boolean,
|
|
||||||
blockstore_enabled: boolean,
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
stats: {
|
|
||||||
/* <osd_num_t>: {
|
|
||||||
time: number, // unix time
|
|
||||||
blockstore_ready: boolean,
|
|
||||||
size: uint64_t, // bytes
|
|
||||||
free: uint64_t, // bytes
|
|
||||||
host: string,
|
|
||||||
op_stats: {
|
|
||||||
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
|
||||||
},
|
|
||||||
subop_stats: {
|
|
||||||
<string>: { count: uint64_t, usec: uint64_t },
|
|
||||||
},
|
|
||||||
recovery_stats: {
|
|
||||||
degraded: { count: uint64_t, bytes: uint64_t },
|
|
||||||
misplaced: { count: uint64_t, bytes: uint64_t },
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
inodestats: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
<inode_t>: {
|
|
||||||
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
|
||||||
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
|
||||||
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
space: {
|
|
||||||
/* <osd_num_t>: {
|
|
||||||
<pool_id>: {
|
|
||||||
<inode_t>: uint64_t, // bytes
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
mon: {
|
|
||||||
master: {
|
|
||||||
/* ip: [ string ], id: uint64_t */
|
|
||||||
},
|
|
||||||
standby: {
|
|
||||||
/* <uint64_t>: { ip: [ string ] }, */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
pg: {
|
|
||||||
state: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
<pg_id>: {
|
|
||||||
primary: osd_num_t,
|
|
||||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
|
||||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
|
||||||
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
|
|
||||||
}
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
stats: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
<pg_id>: {
|
|
||||||
object_count: uint64_t,
|
|
||||||
clean_count: uint64_t,
|
|
||||||
misplaced_count: uint64_t,
|
|
||||||
degraded_count: uint64_t,
|
|
||||||
incomplete_count: uint64_t,
|
|
||||||
write_osd_set: osd_num_t[],
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
history: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
<pg_id>: {
|
|
||||||
osd_sets: osd_num_t[][],
|
|
||||||
all_peers: osd_num_t[],
|
|
||||||
epoch: uint64_t,
|
|
||||||
next_scrub: uint64_t,
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
inode: {
|
|
||||||
stats: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
<inode_t>: {
|
|
||||||
raw_used: uint64_t, // raw used bytes on OSDs
|
|
||||||
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
|
|
||||||
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
|
|
||||||
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
|
|
||||||
},
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
pool: {
|
|
||||||
stats: {
|
|
||||||
/* <pool_id>: {
|
|
||||||
used_raw_tb: float, // used raw space in the pool
|
|
||||||
total_raw_tb: float, // maximum amount of space in the pool
|
|
||||||
raw_to_usable: float, // raw to usable ratio
|
|
||||||
space_efficiency: float, // 0..1
|
|
||||||
} */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
stats: {
|
|
||||||
/* op_stats: {
|
|
||||||
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
|
|
||||||
},
|
|
||||||
subop_stats: {
|
|
||||||
<string>: { count: uint64_t, usec: uint64_t, iops: uint64_t, lat: uint64_t },
|
|
||||||
},
|
|
||||||
recovery_stats: {
|
|
||||||
degraded: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
|
|
||||||
misplaced: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
|
|
||||||
},
|
|
||||||
object_counts: {
|
|
||||||
object: uint64_t,
|
|
||||||
clean: uint64_t,
|
|
||||||
misplaced: uint64_t,
|
|
||||||
degraded: uint64_t,
|
|
||||||
incomplete: uint64_t,
|
|
||||||
},
|
|
||||||
object_bytes: {
|
|
||||||
total: uint64_t,
|
|
||||||
clean: uint64_t,
|
|
||||||
misplaced: uint64_t,
|
|
||||||
degraded: uint64_t,
|
|
||||||
incomplete: uint64_t,
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
history: {
|
|
||||||
last_clean_pgs: {},
|
|
||||||
},
|
|
||||||
index: {
|
|
||||||
image: {
|
|
||||||
/* <name>: {
|
|
||||||
id: uint64_t,
|
|
||||||
pool_id: uint64_t,
|
|
||||||
}, */
|
|
||||||
},
|
|
||||||
maxid: {
|
|
||||||
/* <pool_id>: uint64_t, */
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
module.exports = {
|
|
||||||
etcd_nonempty_keys,
|
|
||||||
etcd_allow,
|
|
||||||
etcd_tree,
|
|
||||||
};
|
|
1702
mon/mon.js
1702
mon/mon.js
File diff suppressed because it is too large
Load Diff
215
mon/osd_tree.js
215
mon/osd_tree.js
|
@ -1,215 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
function get_osd_tree(global_config, state)
|
|
||||||
{
|
|
||||||
const levels = global_config.placement_levels||{};
|
|
||||||
levels.host = levels.host || 100;
|
|
||||||
levels.osd = levels.osd || 101;
|
|
||||||
const tree = {};
|
|
||||||
let up_osds = {};
|
|
||||||
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
|
|
||||||
const down_time = Date.now()/1000 - global_config.osd_out_time;
|
|
||||||
for (const osd_num of Object.keys(state.osd.stats).sort((a, b) => a - b))
|
|
||||||
{
|
|
||||||
const stat = state.osd.stats[osd_num];
|
|
||||||
const osd_cfg = state.config.osd[osd_num];
|
|
||||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
|
||||||
if (reweight < 0 || isNaN(reweight))
|
|
||||||
reweight = 1;
|
|
||||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
|
||||||
osd_cfg && osd_cfg.noout))
|
|
||||||
{
|
|
||||||
// Numeric IDs are reserved for OSDs
|
|
||||||
if (state.osd.state[osd_num] && reweight > 0)
|
|
||||||
{
|
|
||||||
// React to down OSDs immediately
|
|
||||||
up_osds[osd_num] = true;
|
|
||||||
}
|
|
||||||
tree[osd_num] = tree[osd_num] || {};
|
|
||||||
tree[osd_num].id = osd_num;
|
|
||||||
tree[osd_num].parent = tree[osd_num].parent || stat.host;
|
|
||||||
tree[osd_num].level = 'osd';
|
|
||||||
tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
|
|
||||||
if (osd_cfg && osd_cfg.tags)
|
|
||||||
{
|
|
||||||
tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
|
|
||||||
.reduce((a, c) => { a[c] = true; return a; }, {});
|
|
||||||
}
|
|
||||||
delete tree[osd_num].children;
|
|
||||||
if (!tree[stat.host])
|
|
||||||
{
|
|
||||||
tree[stat.host] = {
|
|
||||||
id: stat.host,
|
|
||||||
level: 'host',
|
|
||||||
parent: null,
|
|
||||||
children: [],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (const node_id in state.config.node_placement||{})
|
|
||||||
{
|
|
||||||
const node_cfg = state.config.node_placement[node_id];
|
|
||||||
if (/^\d+$/.exec(node_id))
|
|
||||||
{
|
|
||||||
node_cfg.level = 'osd';
|
|
||||||
}
|
|
||||||
if (!node_id || !node_cfg.level || !levels[node_cfg.level] ||
|
|
||||||
node_cfg.level === 'osd' && !tree[node_id])
|
|
||||||
{
|
|
||||||
// All nodes must have non-empty IDs and valid levels
|
|
||||||
// OSDs have to actually exist
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
tree[node_id] = tree[node_id] || {};
|
|
||||||
tree[node_id].id = node_id;
|
|
||||||
tree[node_id].level = node_cfg.level;
|
|
||||||
tree[node_id].parent = node_cfg.parent;
|
|
||||||
if (node_cfg.level !== 'osd')
|
|
||||||
{
|
|
||||||
tree[node_id].children = [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return { up_osds, levels, osd_tree: tree };
|
|
||||||
}
|
|
||||||
|
|
||||||
function make_hier_tree(global_config, tree)
|
|
||||||
{
|
|
||||||
const levels = global_config.placement_levels||{};
|
|
||||||
levels.host = levels.host || 100;
|
|
||||||
levels.osd = levels.osd || 101;
|
|
||||||
tree = { ...tree };
|
|
||||||
for (const node_id in tree)
|
|
||||||
{
|
|
||||||
tree[node_id] = { ...tree[node_id], children: [] };
|
|
||||||
}
|
|
||||||
tree[''] = { children: [] };
|
|
||||||
for (const node_id in tree)
|
|
||||||
{
|
|
||||||
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const node_cfg = tree[node_id];
|
|
||||||
const node_level = levels[node_cfg.level] || node_cfg.level;
|
|
||||||
let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
|
|
||||||
&& tree[node_cfg.parent].level;
|
|
||||||
parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
|
|
||||||
// Parent's level must be less than child's; OSDs must be leaves
|
|
||||||
const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
|
|
||||||
tree[parent].children.push(tree[node_id]);
|
|
||||||
}
|
|
||||||
// Delete empty nodes
|
|
||||||
let deleted = 0;
|
|
||||||
do
|
|
||||||
{
|
|
||||||
deleted = 0;
|
|
||||||
for (const node_id in tree)
|
|
||||||
{
|
|
||||||
if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
|
|
||||||
{
|
|
||||||
const parent = tree[node_id].parent;
|
|
||||||
if (parent)
|
|
||||||
{
|
|
||||||
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
|
|
||||||
}
|
|
||||||
deleted++;
|
|
||||||
delete tree[node_id];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} while (deleted > 0);
|
|
||||||
return tree;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Restrict pool_tree (in place) to the subtree rooted at root_node.
// Kept nodes are: root_node itself, its chain of ancestors, and all of its
// descendants in the hierarchy built by make_hier_tree(); everything else
// is deleted. A falsy root_node means "no restriction".
function filter_osds_by_root_node(global_config, pool_tree, root_node)
{
    if (!root_node)
    {
        return;
    }
    const hier = make_hier_tree(global_config, pool_tree);
    // Collect all descendants of root_node
    const keep = [];
    const queue = [ ...((hier[root_node] || {}).children || []) ];
    while (queue.length)
    {
        const node = queue.shift();
        keep.push(node);
        if (node.children)
        {
            queue.push(...node.children);
        }
    }
    // Also keep root_node itself and its chain of ancestors
    let node = pool_tree[root_node] || {};
    while (node && node.id)
    {
        keep.unshift(node);
        node = pool_tree[node.parent || ''];
    }
    const keep_ids = {};
    for (const n of keep)
    {
        keep_ids[n.id || ''] = true;
    }
    for (const id in pool_tree)
    {
        if (!keep_ids[id])
        {
            delete pool_tree[id];
        }
    }
}
|
|
||||||
|
|
||||||
// Delete (in place) every OSD-level node of orig_tree that is missing at
// least one of the required tags. <tags> may be a single tag or an array;
// a falsy value means "no filtering". Node tags are stored as a map of
// tag name -> truthy (see get_osd_tree).
function filter_osds_by_tags(orig_tree, tags)
{
    if (!tags)
    {
        return;
    }
    const required = tags instanceof Array ? tags : [ tags ];
    for (const tag of required)
    {
        for (const id of Object.keys(orig_tree))
        {
            const node = orig_tree[id];
            if (node.level === 'osd' && !(node.tags && node.tags[tag]))
            {
                delete orig_tree[id];
            }
        }
    }
}
|
|
||||||
|
|
||||||
// Delete (in place) OSDs whose reported on-disk layout is incompatible with
// the pool: different block size, different bitmap granularity, or an
// immediate_commit mode weaker than what the pool requires
// ('none' < 'small' < 'all'). OSDs without stats are kept.
function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_granularity, immediate_commit)
{
    const incompatible = (st) =>
        (st.bs_block_size && st.bs_block_size != block_size) ||
        (st.bitmap_granularity && st.bitmap_granularity != bitmap_granularity) ||
        (st.immediate_commit == 'small' && immediate_commit == 'all') ||
        (st.immediate_commit == 'none' && immediate_commit != 'none');
    for (const id of Object.keys(orig_tree))
    {
        if (orig_tree[id].level !== 'osd')
        {
            continue;
        }
        const st = osd_stats[id];
        if (st && incompatible(st))
        {
            delete orig_tree[id];
        }
    }
}
|
|
||||||
|
|
||||||
// Return the set (map osd -> true) of "up" OSDs eligible to be PG primaries
// for the pool. Without primary_affinity_tags this is simply up_osds;
// otherwise the up OSDs are additionally filtered by the configured tags
// using their osd_tree entries.
function get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
    if (!pool_cfg.primary_affinity_tags)
    {
        return up_osds;
    }
    const candidates = {};
    for (const osd of Object.keys(up_osds))
    {
        candidates[osd] = osd_tree[osd];
    }
    filter_osds_by_tags(candidates, pool_cfg.primary_affinity_tags);
    // Normalize surviving entries to plain `true` flags
    for (const osd in candidates)
    {
        candidates[osd] = true;
    }
    return candidates;
}
|
|
||||||
|
|
||||||
// OSD tree construction and filtering helpers used by PG generation
module.exports = {
    get_osd_tree,
    make_hier_tree,
    filter_osds_by_root_node,
    filter_osds_by_tags,
    filter_osds_by_block_layout,
    get_affinity_osds,
};
|
|
|
@ -4,7 +4,7 @@
|
||||||
"description": "Vitastor SDS monitor service",
|
"description": "Vitastor SDS monitor service",
|
||||||
"main": "mon-main.js",
|
"main": "mon-main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"lint": "eslint *.js lp_optimizer/*.js scripts/*.js"
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
},
|
},
|
||||||
"author": "Vitaliy Filippov",
|
"author": "Vitaliy Filippov",
|
||||||
"license": "UNLICENSED",
|
"license": "UNLICENSED",
|
||||||
|
@ -14,10 +14,12 @@
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"eslint": "^8.0.0",
|
"eslint": "^8.0.0",
|
||||||
"eslint-plugin-import": "^2.29.1",
|
|
||||||
"eslint-plugin-node": "^11.1.0"
|
"eslint-plugin-node": "^11.1.0"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=12.0.0"
|
"node": ">=12.0.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"lint": "eslint *.js"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
267
mon/pg_gen.js
267
mon/pg_gen.js
|
@ -1,267 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
|
|
||||||
const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
|
|
||||||
const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
|
|
||||||
const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
|
|
||||||
const { scale_pg_count } = require('./pg_utils.js');
|
|
||||||
const { make_hier_tree, filter_osds_by_root_node,
|
|
||||||
filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
|
|
||||||
|
|
||||||
let seed;
|
|
||||||
|
|
||||||
// Reset the module-level PRNG to a fixed seed so that primary OSD selection
// is deterministic: replaying the same PG list yields the same primaries.
function reset_rng()
{
    seed = 0x5f020e43;
}
|
|
||||||
|
|
||||||
// Xorshift-style PRNG over the module-level `seed`.
// JS bitwise operators treat `seed` as a signed 32-bit integer; the
// + 2147483648 offset shifts the result into a non-negative range.
// Deterministic given the seed set by reset_rng() - do not change the
// constants, or previously picked primaries would be reshuffled.
function rng()
{
    seed ^= seed << 13;
    seed ^= seed >> 17;
    seed ^= seed << 5;
    return seed + 2147483648;
}
|
|
||||||
|
|
||||||
// Pick a primary OSD for one PG from its osd_set.
// up_osds / aff_osds are maps osd_num -> truthy ("alive" / "alive and
// affinity-preferred"). Returns 0 when no OSD of the set is up.
// NOTE: consumes the module-level RNG (rng()), so the caller must invoke
// reset_rng() first and call this in a fixed PG order for determinism.
function pick_primary(pool_config, osd_set, up_osds, aff_osds)
{
    let alive_set;
    if (pool_config.scheme === 'replicated')
    {
        // Prefer "affinity" OSDs
        alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
        if (!alive_set.length)
            alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
    }
    else
    {
        // Prefer data OSDs for EC because they can actually read something without an additional network hop
        const pg_data_size = (pool_config.pg_size||0) - (pool_config.parity_chunks||0);
        // Fallback chain: affinity data OSDs -> any affinity OSD ->
        // up data OSDs -> any up OSD
        alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]);
        if (!alive_set.length)
            alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
        if (!alive_set.length)
        {
            alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
            if (!alive_set.length)
                alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
        }
    }
    if (!alive_set.length)
    {
        return 0;
    }
    return alive_set[rng() % alive_set.length];
}
|
|
||||||
|
|
||||||
// Re-evaluate primary OSD assignments of all PGs in all valid pools.
// Returns a deep copy of state.config.pgs with updated primaries, or
// undefined when nothing needs to change (so the caller can skip the etcd
// write entirely). Does not modify `state` itself.
function recheck_primary(state, global_config, up_osds, osd_tree)
{
    let new_config_pgs;
    for (const pool_id in state.config.pools)
    {
        const pool_cfg = state.config.pools[pool_id];
        if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
        {
            continue;
        }
        const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
        // Reset the RNG per pool so primary selection is reproducible
        reset_rng();
        for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
        {
            if (!state.config.pgs.items[pool_id])
            {
                continue;
            }
            const pg_cfg = state.config.pgs.items[pool_id][pg_num];
            if (pg_cfg)
            {
                const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
                if (pg_cfg.primary != new_primary)
                {
                    // Lazily clone the PG config only when the first change is found
                    if (!new_config_pgs)
                    {
                        new_config_pgs = JSON.parse(JSON.stringify(state.config.pgs));
                    }
                    console.log(
                        `Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
                        ` primary OSD from ${pg_cfg.primary} to ${new_primary}`
                    );
                    new_config_pgs.items[pool_id][pg_num].primary = new_primary;
                }
            }
        }
    }
    return new_config_pgs;
}
|
|
||||||
|
|
||||||
// Prepare the etcd transaction that saves a pool's new PG configuration.
// - Writes the new PG items (osd_set + picked primary) into save_to.items;
// - Appends per-PG history puts/deletes and mod-revision compares to
//   `request` (an etcd txn object with .compare/.success arrays);
// - Records previous osd_sets into pg_history for PGs whose set changed.
// Mutates save_to, request and pg_history; does not contact etcd itself.
function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
{
    const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
    const pg_items = {};
    // Deterministic primary selection (see reset_rng/pick_primary)
    reset_rng();
    new_pgs.map((osd_set, i) =>
    {
        osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
        pg_items[i+1] = {
            osd_set,
            primary: pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds),
        };
        // Remember the old set in history when the PG actually moved
        // and the old set was not empty
        if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
            prev_pgs[i].filter(osd_num => osd_num).length > 0)
        {
            pg_history[i] = pg_history[i] || {};
            pg_history[i].osd_sets = pg_history[i].osd_sets || [];
            pg_history[i].osd_sets.push(prev_pgs[i]);
        }
        // Deduplicate recorded osd_sets (keyed by their joined form)
        if (pg_history[i] && pg_history[i].osd_sets)
        {
            pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
                .reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
        }
    });
    for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
    {
        // FIXME: etcd has max_txn_ops limit, and it's 128 by default
        // Sooo we probably want to change our storage scheme for PG histories...
        // Guard: the history key must not have been modified since our watch revision
        request.compare.push({
            key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
            target: 'MOD',
            mod_revision: ''+etcd_watch_revision,
            result: 'LESS',
        });
        if (pg_history[i])
        {
            request.success.push({
                requestPut: {
                    key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
                    value: b64(JSON.stringify(pg_history[i])),
                },
            });
        }
        else
        {
            // No history for this PG (or the PG no longer exists) - drop the key
            request.success.push({
                requestDeleteRange: {
                    key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
                },
            });
        }
    }
    save_to.items = save_to.items || {};
    if (!new_pgs.length)
    {
        delete save_to.items[pool_id];
    }
    else
    {
        save_to.items[pool_id] = pg_items;
    }
}
|
|
||||||
|
|
||||||
// Compute a new PG -> OSD set mapping for one pool.
// Filters the OSD tree down to the OSDs usable by this pool, then runs the
// LP optimizer (incrementally against the previous PGs when possible, to
// minimize data movement).
// Returns null for an invalid pool config, otherwise
// { pool_id, pgs, stats } where `pgs` is an array of OSD sets and `stats`
// holds derived capacity/efficiency numbers.
async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels)
{
    const pool_cfg = state.config.pools[pool_id];
    if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
    {
        return null;
    }
    // Restrict the tree to OSDs matching the pool's root node, tags and layout
    let pool_tree = { ...osd_tree };
    filter_osds_by_root_node(global_config, pool_tree, pool_cfg.root_node);
    filter_osds_by_tags(pool_tree, pool_cfg.osd_tags);
    filter_osds_by_block_layout(
        pool_tree,
        state.osd.stats,
        pool_cfg.block_size || global_config.block_size || 131072,
        pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
        pool_cfg.immediate_commit || global_config.immediate_commit || 'none'
    );
    pool_tree = make_hier_tree(global_config, pool_tree);
    // First try last_clean_pgs to minimize data movement
    let prev_pgs = [];
    for (const pg in ((state.history.last_clean_pgs.items||{})[pool_id]||{}))
    {
        prev_pgs[pg-1] = [ ...state.history.last_clean_pgs.items[pool_id][pg].osd_set ];
    }
    if (!prev_pgs.length)
    {
        // Fall back to config/pgs if it's empty
        for (const pg in ((state.config.pgs.items||{})[pool_id]||{}))
        {
            prev_pgs[pg-1] = [ ...state.config.pgs.items[pool_id][pg].osd_set ];
        }
    }
    const old_pg_count = prev_pgs.length;
    const optimize_cfg = {
        osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
        combinator: !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
            // new algorithm:
            ? new RuleCombinator(pool_tree, get_pg_rules(pool_id, pool_cfg, global_config.placement_levels), pool_cfg.max_osd_combinations)
            // old algorithm:
            : new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
        pg_count: pool_cfg.pg_count,
        pg_size: pool_cfg.pg_size,
        pg_minsize: pool_cfg.pg_minsize,
        ordered: pool_cfg.scheme != 'replicated',
    };
    let optimize_result;
    // Re-shuffle PGs if config/pgs.hash is empty
    if (old_pg_count > 0 && state.config.pgs.hash)
    {
        if (prev_pgs.length != pool_cfg.pg_count)
        {
            // Scale PG count
            // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
            // because last_clean_pgs may still contain the old number of PGs
            scale_pg_count(prev_pgs, pool_cfg.pg_count);
        }
        // Pad short PGs with zeroes (= missing OSDs) up to pg_size
        for (const pg of prev_pgs)
        {
            while (pg.length < pool_cfg.pg_size)
            {
                pg.push(0);
            }
        }
        optimize_result = await LPOptimizer.optimize_change({
            prev_pgs,
            ...optimize_cfg,
        });
    }
    else
    {
        optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
    }
    console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
    LPOptimizer.print_change_stats(optimize_result);
    // Effective PG size = smallest number of real (non-missing) OSDs in any PG
    let pg_effsize = pool_cfg.pg_size;
    for (const pg of optimize_result.int_pgs)
    {
        const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
        if (this_pg_size && this_pg_size < pg_effsize)
        {
            pg_effsize = this_pg_size;
        }
    }
    return {
        pool_id,
        pgs: optimize_result.int_pgs,
        stats: {
            total_raw_tb: optimize_result.space,
            pg_real_size: pg_effsize || pool_cfg.pg_size,
            raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
                ? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
            space_efficiency: optimize_result.space/(optimize_result.total_space||1),
        },
    };
}
|
|
||||||
|
|
||||||
// Base64-encode a string; used above for etcd request keys and values.
function b64(str)
{
    const buf = Buffer.from(str);
    return buf.toString('base64');
}
|
|
||||||
|
|
||||||
// PG generation / primary re-election entry points used by the monitor
module.exports = {
    recheck_primary,
    save_new_pgs_txn,
    generate_pool_pgs,
};
|
|
|
@ -1,169 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
const { parse_level_indexes, parse_pg_dsl } = require('./lp_optimizer/dsl_pgs.js');
|
|
||||||
|
|
||||||
// Validate one pool's configuration.
// NOTE: mutates pool_cfg - numeric fields are floored/normalized and
// max_osd_combinations gets its default (10000).
// Returns true when the config is usable; false otherwise, printing the
// reason to the console when `warn` is set.
function validate_pool_cfg(pool_id, pool_cfg, placement_levels, warn)
{
    pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
    pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
    pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
    pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
    pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
    // Pool IDs are positive integers
    if (!/^[1-9]\d*$/.exec(''+pool_id))
    {
        if (warn)
            console.log('Pool ID '+pool_id+' is invalid');
        return false;
    }
    if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
        pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
        return false;
    }
    // Non-replicated schemes need at least 3 chunks (2 data + 1 parity)
    if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
        pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid pg_size');
        return false;
    }
    if (!pool_cfg.pg_minsize || pool_cfg.pg_minsize < 1 || pool_cfg.pg_minsize > pool_cfg.pg_size ||
        pool_cfg.scheme === 'xor' && pool_cfg.pg_minsize < (pool_cfg.pg_size - 1))
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid pg_minsize');
        return false;
    }
    if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
        return false;
    }
    if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
        (pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
        return false;
    }
    if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid pg_count');
        return false;
    }
    if (!pool_cfg.name)
    {
        if (warn)
            console.log('Pool '+pool_id+' has empty name');
        return false;
    }
    if (pool_cfg.max_osd_combinations < 100)
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
        return false;
    }
    if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid root_node (must be a string)');
        return false;
    }
    // osd_tags / primary_affinity_tags: a string or an array of strings
    if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
        (!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
        return false;
    }
    if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
        (!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
    {
        if (warn)
            console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
        return false;
    }
    // Placement rules (level_placement / raw_placement) must also parse
    if (!get_pg_rules(pool_id, pool_cfg, placement_levels, true))
    {
        return false;
    }
    return true;
}
|
|
||||||
|
|
||||||
// Build PG placement rules for a pool, in the form consumed by RuleCombinator.
// Sources, in priority order:
// 1) pool_cfg.level_placement - per-level index patterns, either a
//    "level=indexes" string or a { level: indexes } map;
// 2) pool_cfg.raw_placement - a raw placement DSL string;
// 3) default - all pg_size chunks on distinct failure domains.
// Returns the parsed rules; returns null (or falls through with undefined
// on a raw_placement parse error) when the config is invalid, printing the
// reason when `warn` is set.
function get_pg_rules(pool_id, pool_cfg, placement_levels, warn)
{
    if (pool_cfg.level_placement)
    {
        const pg_size = (0|pool_cfg.pg_size);
        let rules = pool_cfg.level_placement;
        if (typeof rules === 'string')
        {
            // "dc=112233 host=123456" -> { dc: '112233', host: '123456' }
            rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
        }
        else
        {
            rules = { ...rules };
        }
        // Always add failure_domain to prevent rules from being totally incorrect
        const all_diff = [];
        for (let i = 1; i <= pg_size; i++)
        {
            all_diff.push(i);
        }
        rules[pool_cfg.failure_domain || 'host'] = all_diff;
        placement_levels = placement_levels||{};
        placement_levels.host = placement_levels.host || 100;
        placement_levels.osd = placement_levels.osd || 101;
        for (const k in rules)
        {
            // Each rule must target a known level and be a string or an
            // array of strings/numbers with exactly pg_size items
            if (!placement_levels[k] || typeof rules[k] !== 'string' &&
                (!(rules[k] instanceof Array) ||
                rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
            {
                if (warn)
                    console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
                return null;
            }
            else if (rules[k].length != pg_size)
            {
                if (warn)
                    console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
                return null;
            }
        }
        return parse_level_indexes(rules);
    }
    else if (typeof pool_cfg.raw_placement === 'string')
    {
        try
        {
            return parse_pg_dsl(pool_cfg.raw_placement);
        }
        catch (e)
        {
            if (warn)
                console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
            // NOTE: falls through and returns undefined (treated as invalid by callers)
        }
    }
    else
    {
        // Default: chunk i must differ from all previous chunks at failure_domain level
        let rules = [ [] ];
        let prev = [ 1 ];
        for (let i = 1; i < pool_cfg.pg_size; i++)
        {
            rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
            prev = [ ...prev, i+1 ];
        }
        return rules;
    }
}
|
|
||||||
|
|
||||||
// Pool configuration validation helpers
module.exports = {
    validate_pool_cfg,
    get_pg_rules,
};
|
|
286
mon/stats.js
286
mon/stats.js
|
@ -1,286 +0,0 @@
|
||||||
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
|
|
||||||
// Derive per-second rates (bps/iops) and average latency for one OSD from
// two consecutive raw stat snapshots (`st` = current, `prev` = previous).
// All arithmetic is done in BigInt. When the snapshots are unusable
// (missing or non-increasing timestamps), the previous diff is returned
// unchanged so rates do not flap to zero.
function derive_osd_stats(st, prev, prev_diff)
{
    const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
    if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
    {
        return prev_diff || diff;
    }
    // Time delta in milliseconds (st.time/prev.time are in seconds)
    const timediff = BigInt(st.time*1000 - prev.time*1000);
    for (const op in st.op_stats||{})
    {
        const pr = prev && prev.op_stats && prev.op_stats[op];
        let c = st.op_stats[op];
        c = { bytes: BigInt(c.bytes||0), usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
        const b = c.bytes - BigInt(pr && pr.bytes||0);
        const us = c.usec - BigInt(pr && pr.usec||0);
        const n = c.count - BigInt(pr && pr.count||0);
        // Only report ops that actually happened in the interval (n > 0
        // also guards the division for `lat`)
        if (n > 0)
            diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
    }
    for (const op in st.subop_stats||{})
    {
        const pr = prev && prev.subop_stats && prev.subop_stats[op];
        let c = st.subop_stats[op];
        c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
        const us = c.usec - BigInt(pr && pr.usec||0);
        const n = c.count - BigInt(pr && pr.count||0);
        if (n > 0)
            diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
    }
    for (const op in st.recovery_stats||{})
    {
        const pr = prev && prev.recovery_stats && prev.recovery_stats[op];
        let c = st.recovery_stats[op];
        c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
        const b = c.bytes - BigInt(pr && pr.bytes||0);
        const n = c.count - BigInt(pr && pr.count||0);
        if (n > 0)
            diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
    }
    // Per-inode read/write/delete rates, grouped by pool
    for (const pool_id in st.inode_stats||{})
    {
        diff.inode_stats[pool_id] = {};
        for (const inode_num in st.inode_stats[pool_id])
        {
            const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
            for (const op of [ 'read', 'write', 'delete' ])
            {
                const c = st.inode_stats[pool_id][inode_num][op];
                const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
                    prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
                const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
                inode_diff[op] = {
                    bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
                    iops: n*1000n/timediff,
                    // `n || 1n` avoids division by zero when no ops happened
                    lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
                };
            }
        }
    }
    return diff;
}
|
|
||||||
|
|
||||||
// sum_op_stats(this.state.osd, this.prev_stats)
|
|
||||||
// sum_op_stats(this.state.osd, this.prev_stats)
// Update per-OSD derived stats (rates) in prev_stats and return cluster-wide
// sums of op/subop/recovery rates across all currently-up OSDs.
// Mutates prev_stats.osd_diff and prev_stats.osd_stats.
function sum_op_stats(all_osd, prev_stats)
{
    for (const osd in all_osd.stats)
    {
        // Combine raw stats with per-inode stats into one snapshot
        const cur = { ...all_osd.stats[osd], inode_stats: all_osd.inodestats[osd]||{} };
        prev_stats.osd_diff[osd] = derive_osd_stats(
            cur, prev_stats.osd_stats[osd], prev_stats.osd_diff[osd]
        );
        prev_stats.osd_stats[osd] = cur;
    }
    const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
    // Sum derived values instead of deriving summed
    for (const osd in all_osd.state)
    {
        const derived = prev_stats.osd_diff[osd];
        // Skip OSDs that are down or have no derived stats yet
        if (!all_osd.state[osd] || !derived)
        {
            continue;
        }
        for (const type in sum_diff)
        {
            for (const op in derived[type]||{})
            {
                for (const k in derived[type][op])
                {
                    sum_diff[type][op] = sum_diff[type][op] || {};
                    sum_diff[type][op][k] = (sum_diff[type][op][k] || 0n) + derived[type][op][k];
                }
            }
        }
    }
    return sum_diff;
}
|
|
||||||
|
|
||||||
// sum_object_counts(this.state, this.config)
|
|
||||||
// sum_object_counts(this.state, this.config)
// Sum object counts (clean/misplaced/degraded/incomplete) over all PG stats,
// and estimate the corresponding byte totals using each pool's object size.
// Returns { object_counts, object_bytes } with BigInt values.
function sum_object_counts(state, global_config)
{
    const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
    const object_bytes = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
    for (const pool_id in state.pg.stats)
    {
        // Take the object size from the first OSD of the pool's write set
        // that reports one...
        let object_size = 0;
        for (const osd_num of state.pg.stats[pool_id].write_osd_set||[])
        {
            if (osd_num && state.osd.stats[osd_num] && state.osd.stats[osd_num].block_size)
            {
                object_size = state.osd.stats[osd_num].block_size;
                break;
            }
        }
        const pool_cfg = (state.config.pools[pool_id]||{});
        // ...falling back to the configured block size
        if (!object_size)
        {
            object_size = pool_cfg.block_size || global_config.block_size || 131072;
        }
        // For EC/XOR pools a full object spans all data chunks
        if (pool_cfg.scheme !== 'replicated')
        {
            object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
        }
        object_size = BigInt(object_size);
        for (const pg_num in state.pg.stats[pool_id])
        {
            const st = state.pg.stats[pool_id][pg_num];
            if (st)
            {
                for (const k in object_counts)
                {
                    if (st[k+'_count'])
                    {
                        object_counts[k] += BigInt(st[k+'_count']);
                        object_bytes[k] += BigInt(st[k+'_count']) * object_size;
                    }
                }
            }
        }
    }
    return { object_counts, object_bytes };
}
|
|
||||||
|
|
||||||
// sum_inode_stats(this.state, this.prev_stats)
|
|
||||||
// sum_inode_stats(this.state, this.prev_stats)
// Aggregate per-inode statistics across all OSDs:
// - raw_used from state.osd.space;
// - cumulative read/write/delete counters from state.osd.inodestats;
// - derived rates (bps/iops/lat) from prev_stats.osd_diff, with latency
//   averaged over the number of reporting OSDs.
// Also recomputes state.pool.stats[pool].used_raw_tb (in TB, as Number).
// Returns { inode_stats, seen_pools }; drops inodes with no data, no I/O
// and no configuration ("deleted" inodes).
function sum_inode_stats(state, prev_stats)
{
    const inode_stats = {};
    const inode_stub = () => ({
        raw_used: 0n,
        read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
    });
    const seen_pools = {};
    for (const pool_id in state.config.pools)
    {
        seen_pools[pool_id] = true;
        state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
        state.pool.stats[pool_id].used_raw_tb = 0n;
    }
    // Sum raw space usage per pool and per inode
    for (const osd_num in state.osd.space)
    {
        for (const pool_id in state.osd.space[osd_num])
        {
            state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
            if (!seen_pools[pool_id])
            {
                state.pool.stats[pool_id].used_raw_tb = 0n;
                seen_pools[pool_id] = true;
            }
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in state.osd.space[osd_num][pool_id])
            {
                const u = BigInt(state.osd.space[osd_num][pool_id][inode_num]||0);
                if (inode_num)
                {
                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                    inode_stats[pool_id][inode_num].raw_used += u;
                }
                state.pool.stats[pool_id].used_raw_tb += u;
            }
        }
    }
    // Convert per-pool raw usage from bytes (BigInt) to TB (Number)
    for (const pool_id in seen_pools)
    {
        const used = state.pool.stats[pool_id].used_raw_tb;
        state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
    }
    // Sum cumulative per-inode op counters from up OSDs
    for (const osd_num in state.osd.state)
    {
        const ist = state.osd.inodestats[osd_num];
        if (!ist || !state.osd.state[osd_num])
        {
            continue;
        }
        for (const pool_id in ist)
        {
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in ist[pool_id])
            {
                inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    inode_stats[pool_id][inode_num][op].count += BigInt(ist[pool_id][inode_num][op].count||0);
                    inode_stats[pool_id][inode_num][op].usec += BigInt(ist[pool_id][inode_num][op].usec||0);
                    inode_stats[pool_id][inode_num][op].bytes += BigInt(ist[pool_id][inode_num][op].bytes||0);
                }
            }
        }
    }
    // Merge derived per-OSD rates; latency is averaged later via n_osd
    for (const osd in state.osd.state)
    {
        const osd_diff = prev_stats.osd_diff[osd];
        if (!osd_diff || !state.osd.state[osd])
        {
            continue;
        }
        for (const pool_id in osd_diff.inode_stats)
        {
            // FIX: the pool may be absent from inode_stats if no space or
            // inodestats were reported for it this round - initialize it
            // like the loops above do, instead of crashing
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in prev_stats.osd_diff[osd].inode_stats[pool_id])
            {
                inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const op_diff = prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
                    const op_st = inode_stats[pool_id][inode_num][op];
                    // FIX: default missing fields to 0n - adding undefined
                    // to a BigInt throws a TypeError
                    op_st.bps += op_diff.bps||0n;
                    op_st.iops += op_diff.iops||0n;
                    op_st.lat += op_diff.lat||0n;
                    op_st.n_osd = (op_st.n_osd || 0) + 1;
                }
            }
        }
    }
    for (const pool_id in inode_stats)
    {
        for (const inode_num in inode_stats[pool_id])
        {
            let nonzero = inode_stats[pool_id][inode_num].raw_used > 0;
            for (const op of [ 'read', 'write', 'delete' ])
            {
                const op_st = inode_stats[pool_id][inode_num][op];
                if (op_st.n_osd)
                {
                    // Average latency over the OSDs that reported it
                    op_st.lat /= BigInt(op_st.n_osd);
                    delete op_st.n_osd;
                }
                if (op_st.bps > 0 || op_st.iops > 0)
                    nonzero = true;
            }
            if (!nonzero && (!state.config.inode[pool_id] || !state.config.inode[pool_id][inode_num]))
            {
                // Deleted inode (no data, no I/O, no config)
                delete inode_stats[pool_id][inode_num];
            }
        }
    }
    return { inode_stats, seen_pools };
}
|
|
||||||
|
|
||||||
function serialize_bigints(obj)
|
|
||||||
{
|
|
||||||
obj = { ...obj };
|
|
||||||
for (const k in obj)
|
|
||||||
{
|
|
||||||
if (typeof obj[k] == 'bigint')
|
|
||||||
{
|
|
||||||
obj[k] = ''+obj[k];
|
|
||||||
}
|
|
||||||
else if (typeof obj[k] == 'object')
|
|
||||||
{
|
|
||||||
obj[k] = serialize_bigints(obj[k]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return obj;
|
|
||||||
}
|
|
||||||
|
|
||||||
module.exports = {
|
|
||||||
derive_osd_stats,
|
|
||||||
sum_op_stats,
|
|
||||||
sum_object_counts,
|
|
||||||
sum_inode_stats,
|
|
||||||
serialize_bigints,
|
|
||||||
};
|
|
|
@ -8,7 +8,7 @@
|
||||||
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
|
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
|
||||||
|
|
||||||
const { SimpleCombinator } = require('./simple_pgs.js');
|
const { SimpleCombinator } = require('./simple_pgs.js');
|
||||||
const LPOptimizer = require('./lp_optimizer.js');
|
const LPOptimizer = require('./lp-optimizer.js');
|
||||||
|
|
||||||
const osd_tree = {
|
const osd_tree = {
|
||||||
ripper5: {
|
ripper5: {
|
|
@ -2,7 +2,7 @@
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
const { compat } = require('./simple_pgs.js');
|
const { compat } = require('./simple_pgs.js');
|
||||||
const LPOptimizer = require('./lp_optimizer.js');
|
const LPOptimizer = require('./lp-optimizer.js');
|
||||||
|
|
||||||
async function run()
|
async function run()
|
||||||
{
|
{
|
|
@ -2,7 +2,7 @@
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||||
const LPOptimizer = require('./lp_optimizer.js');
|
const LPOptimizer = require('./lp-optimizer.js');
|
||||||
|
|
||||||
const crush_tree = [
|
const crush_tree = [
|
||||||
{ level: 1, children: [
|
{ level: 1, children: [
|
|
@ -2,7 +2,7 @@
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
const { compat } = require('./simple_pgs.js');
|
const { compat } = require('./simple_pgs.js');
|
||||||
const LPOptimizer = require('./lp_optimizer.js');
|
const LPOptimizer = require('./lp-optimizer.js');
|
||||||
|
|
||||||
const osd_tree = {
|
const osd_tree = {
|
||||||
100: {
|
100: {
|
|
@ -2,7 +2,7 @@
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||||
const LPOptimizer = require('./lp_optimizer.js');
|
const LPOptimizer = require('./lp-optimizer.js');
|
||||||
|
|
||||||
const osd_tree = {
|
const osd_tree = {
|
||||||
100: {
|
100: {
|
|
@ -108,11 +108,10 @@ npm install --production
|
||||||
cd ..
|
cd ..
|
||||||
mkdir -p %buildroot/usr/lib/vitastor
|
mkdir -p %buildroot/usr/lib/vitastor
|
||||||
cp -r mon %buildroot/usr/lib/vitastor
|
cp -r mon %buildroot/usr/lib/vitastor
|
||||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
|
||||||
mkdir -p %buildroot/lib/systemd/system
|
mkdir -p %buildroot/lib/systemd/system
|
||||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||||
mkdir -p %buildroot/lib/udev/rules.d
|
mkdir -p %buildroot/lib/udev/rules.d
|
||||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||||
|
|
||||||
|
|
||||||
%files
|
%files
|
||||||
|
|
|
@ -105,11 +105,10 @@ npm install --production
|
||||||
cd ..
|
cd ..
|
||||||
mkdir -p %buildroot/usr/lib/vitastor
|
mkdir -p %buildroot/usr/lib/vitastor
|
||||||
cp -r mon %buildroot/usr/lib/vitastor
|
cp -r mon %buildroot/usr/lib/vitastor
|
||||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
|
||||||
mkdir -p %buildroot/lib/systemd/system
|
mkdir -p %buildroot/lib/systemd/system
|
||||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||||
mkdir -p %buildroot/lib/udev/rules.d
|
mkdir -p %buildroot/lib/udev/rules.d
|
||||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||||
|
|
||||||
|
|
||||||
%files
|
%files
|
||||||
|
|
|
@ -98,11 +98,10 @@ npm install --production
|
||||||
cd ..
|
cd ..
|
||||||
mkdir -p %buildroot/usr/lib/vitastor
|
mkdir -p %buildroot/usr/lib/vitastor
|
||||||
cp -r mon %buildroot/usr/lib/vitastor
|
cp -r mon %buildroot/usr/lib/vitastor
|
||||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
|
||||||
mkdir -p %buildroot/lib/systemd/system
|
mkdir -p %buildroot/lib/systemd/system
|
||||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||||
mkdir -p %buildroot/lib/udev/rules.d
|
mkdir -p %buildroot/lib/udev/rules.d
|
||||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||||
|
|
||||||
|
|
||||||
%files
|
%files
|
||||||
|
|
Loading…
Reference in New Issue