Compare commits

..

4 Commits

Author SHA1 Message Date
Vitaliy Filippov df668286fb Add Grafana dashboard
Test / test_snapshot_chain_ec (push) Successful in 2m58s Details
Test / test_rebalance_verify_imm (push) Successful in 4m27s Details
Test / test_root_node (push) Successful in 12s Details
Test / test_rebalance_verify (push) Successful in 5m7s Details
Test / test_switch_primary (push) Successful in 37s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m24s Details
Test / test_write (push) Successful in 41s Details
Test / test_write_no_same (push) Successful in 23s Details
Test / test_write_xor (push) Successful in 2m2s Details
Test / test_rebalance_verify_ec (push) Successful in 6m4s Details
Test / test_heal_ec (push) Successful in 4m6s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m40s Details
Test / test_heal_csum_32k_dj (push) Successful in 5m13s Details
Test / test_heal_pg_size_2 (push) Failing after 10m31s Details
Test / test_heal_csum_32k (push) Successful in 6m6s Details
Test / test_osd_tags (push) Successful in 46s Details
Test / test_heal_csum_4k_dmj (push) Successful in 5m45s Details
Test / test_heal_csum_4k_dj (push) Successful in 5m56s Details
Test / test_enospc (push) Successful in 1m58s Details
Test / test_enospc_xor (push) Successful in 2m17s Details
Test / test_enospc_imm (push) Successful in 1m26s Details
Test / test_enospc_imm_xor (push) Successful in 1m57s Details
Test / test_scrub_zero_osd_2 (push) Successful in 39s Details
Test / test_scrub (push) Successful in 44s Details
Test / test_heal_csum_4k (push) Successful in 5m21s Details
Test / test_scrub_xor (push) Successful in 40s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 42s Details
Test / test_nfs (push) Successful in 18s Details
Test / test_scrub_pg_size_3 (push) Successful in 56s Details
Test / test_scrub_ec (push) Successful in 25s Details
2024-07-09 02:39:36 +03:00
Vitaliy Filippov 667c5999c9 Report all PG states
Test / test_snapshot_chain_ec (push) Successful in 3m1s Details
Test / test_rebalance_verify_imm (push) Successful in 6m25s Details
Test / test_root_node (push) Successful in 14s Details
Test / test_rebalance_verify (push) Successful in 7m1s Details
Test / test_switch_primary (push) Successful in 40s Details
Test / test_write (push) Successful in 43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 5m40s Details
Test / test_write_no_same (push) Successful in 19s Details
Test / test_write_xor (push) Successful in 1m21s Details
Test / test_rebalance_verify_ec (push) Successful in 8m11s Details
Test / test_heal_pg_size_2 (push) Successful in 3m51s Details
Test / test_heal_csum_32k_dj (push) Successful in 4m49s Details
Test / test_heal_csum_32k (push) Successful in 4m34s Details
Test / test_heal_ec (push) Failing after 10m27s Details
Test / test_heal_csum_4k_dmj (push) Successful in 4m27s Details
Test / test_heal_csum_32k_dmj (push) Failing after 10m28s Details
Test / test_osd_tags (push) Successful in 41s Details
Test / test_heal_csum_4k_dj (push) Successful in 4m33s Details
Test / test_enospc (push) Successful in 1m41s Details
Test / test_enospc_xor (push) Successful in 2m20s Details
Test / test_enospc_imm (push) Successful in 1m28s Details
Test / test_enospc_imm_xor (push) Successful in 1m54s Details
Test / test_scrub (push) Successful in 37s Details
Test / test_scrub_zero_osd_2 (push) Successful in 53s Details
Test / test_heal_csum_4k (push) Successful in 4m52s Details
Test / test_scrub_xor (push) Successful in 32s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 29s Details
Test / test_nfs (push) Successful in 20s Details
Test / test_scrub_ec (push) Successful in 28s Details
Test / test_scrub_pg_size_3 (push) Successful in 55s Details
2024-07-08 19:52:56 +03:00
Vitaliy Filippov 8ad63465cd Do not wipe previous metrics at moments when difference is 0
Test / test_snapshot_chain_ec (push) Successful in 2m48s Details
Test / test_rebalance_verify_imm (push) Successful in 3m24s Details
Test / test_root_node (push) Successful in 12s Details
Test / test_rebalance_verify (push) Successful in 4m3s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 41s Details
Test / test_write_no_same (push) Successful in 20s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 2m59s Details
Test / test_write_xor (push) Successful in 1m16s Details
Test / test_rebalance_verify_ec (push) Successful in 5m58s Details
Test / test_heal_pg_size_2 (push) Successful in 4m7s Details
Test / test_heal_ec (push) Successful in 4m3s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m43s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m10s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m28s Details
Test / test_osd_tags (push) Successful in 31s Details
Test / test_enospc (push) Successful in 56s Details
Test / test_enospc_xor (push) Successful in 1m21s Details
Test / test_enospc_imm (push) Successful in 43s Details
Test / test_heal_csum_32k (push) Failing after 10m22s Details
Test / test_scrub (push) Successful in 30s Details
Test / test_enospc_imm_xor (push) Successful in 51s Details
Test / test_scrub_xor (push) Successful in 31s Details
Test / test_scrub_zero_osd_2 (push) Successful in 34s Details
Test / test_heal_csum_4k_dj (push) Successful in 10m6s Details
Test / test_scrub_ec (push) Successful in 35s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s Details
Test / test_scrub_pg_size_3 (push) Successful in 50s Details
Test / test_nfs (push) Successful in 15s Details
Test / test_heal_csum_4k (push) Successful in 8m28s Details
2024-07-08 02:20:12 +03:00
Vitaliy Filippov 976290e6a9 Implement built-in Prometheus exporter in monitor 2024-07-08 02:20:12 +03:00
9 changed files with 3178 additions and 4 deletions

View File

@ -8,6 +8,13 @@
These parameters only apply to Monitors.
- [enable_prometheus](#enable_prometheus)
- [mon_http_port](#mon_http_port)
- [mon_http_ip](#mon_http_ip)
- [mon_https_cert](#mon_https_cert)
- [mon_https_key](#mon_https_key)
- [mon_https_client_auth](#mon_https_client_auth)
- [mon_https_ca](#mon_https_ca)
- [etcd_mon_ttl](#etcd_mon_ttl)
- [etcd_mon_timeout](#etcd_mon_timeout)
- [etcd_mon_retries](#etcd_mon_retries)
@ -17,6 +24,51 @@ These parameters only apply to Monitors.
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## enable_prometheus
- Type: boolean
- Default: true
Enable built-in Prometheus metrics exporter
## mon_http_port
- Type: integer
- Default: 8060
HTTP port for monitors to listen on (including metrics exporter)
## mon_http_ip
- Type: string
IP address for monitors to listen on (all addresses by default)
## mon_https_cert
- Type: string
Path to PEM SSL certificate file for monitor to listen using HTTPS
## mon_https_key
- Type: string
Path to PEM SSL private key file for monitor to listen using HTTPS
## mon_https_client_auth
- Type: boolean
- Default: false
Enable HTTPS client certificate-based authorization for monitor connections
## mon_https_ca
- Type: string
Path to CA certificate for client HTTPS authorization
## etcd_mon_ttl
- Type: seconds

View File

@ -8,6 +8,13 @@
Данные параметры используются только мониторами Vitastor.
- [enable_prometheus](#enable_prometheus)
- [mon_http_port](#mon_http_port)
- [mon_http_ip](#mon_http_ip)
- [mon_https_cert](#mon_https_cert)
- [mon_https_key](#mon_https_key)
- [mon_https_client_auth](#mon_https_client_auth)
- [mon_https_ca](#mon_https_ca)
- [etcd_mon_ttl](#etcd_mon_ttl)
- [etcd_mon_timeout](#etcd_mon_timeout)
- [etcd_mon_retries](#etcd_mon_retries)
@ -17,6 +24,51 @@
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## enable_prometheus
- Тип: булево (да/нет)
- Значение по умолчанию: true
Включить встроенный Prometheus-экспортер метрик
## mon_http_port
- Тип: целое число
- Значение по умолчанию: 8060
Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
## mon_http_ip
- Тип: строка
IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
## mon_https_cert
- Тип: строка
Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
## mon_https_key
- Тип: строка
Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
## mon_https_client_auth
- Тип: булево (да/нет)
- Значение по умолчанию: false
Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
## mon_https_ca
- Тип: строка
Путь к удостоверяющему сертификату для авторизации клиентских HTTPS-соединений
## etcd_mon_ttl
- Тип: секунды

View File

@ -1,3 +1,34 @@
- name: enable_prometheus
type: bool
default: true
info: Enable built-in Prometheus metrics exporter
info_ru: Включить встроенный Prometheus-экспортер метрик
- name: mon_http_port
type: int
default: 8060
info: HTTP port for monitors to listen on (including metrics exporter)
info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
- name: mon_http_ip
type: string
info: IP address for monitors to listen on (all addresses by default)
info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
- name: mon_https_cert
type: string
info: Path to PEM SSL certificate file for monitor to listen using HTTPS
info_ru: Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
- name: mon_https_key
type: string
info: Path to PEM SSL private key file for monitor to listen using HTTPS
info_ru: Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
- name: mon_https_client_auth
type: bool
default: false
info: Enable HTTPS client certificate-based authorization for monitor connections
info_ru: Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
- name: mon_https_ca
type: string
info: Path to CA certificate for client HTTPS authorization
info_ru: Путь к удостоверяющему сертификату для авторизации клиентских HTTPS-соединений
- name: etcd_mon_ttl
type: sec
min: 5

View File

@ -245,6 +245,9 @@ const etcd_tree = {
stats: {
/* <osd_num_t>: {
time: number, // unix time
data_block_size: uint64_t, // bytes
bitmap_granularity: uint64_t, // bytes
immediate_commit: "all"|"small"|"none",
blockstore_ready: boolean,
size: uint64_t, // bytes
free: uint64_t, // bytes
@ -282,7 +285,7 @@ const etcd_tree = {
master: {
/* ip: [ string ], id: uint64_t */
},
standby: {
member: {
/* <uint64_t>: { ip: [ string ] }, */
},
},

46
mon/http_server.js Normal file
View File

@ -0,0 +1,46 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const fsp = require('fs').promises;
const http = require('http');
const https = require('https');
/**
 * Create the monitor HTTP(S) server and start listening.
 *
 * An HTTPS server is created when cfg.mon_https_cert is set, otherwise a
 * plain HTTP server is used.
 *
 * @param {object} cfg - monitor configuration. Reads mon_http_port (default 8060),
 *   mon_http_ip, mon_https_cert, mon_https_key, mon_https_ca, mon_https_client_auth.
 * @param {Function} handler - request listener, invoked as handler(req, res)
 * @returns {Promise<object|null>} the listening server, or null when listen failed
 * @throws rejects when a configured key/certificate/CA file can't be read
 */
async function create_http_server(cfg, handler)
{
    const port = cfg.mon_http_port || 8060;
    let server;
    if (cfg.mon_https_cert)
    {
        const tls = {
            key: await fsp.readFile(cfg.mon_https_key),
            cert: await fsp.readFile(cfg.mon_https_cert),
        };
        if (cfg.mon_https_ca)
        {
            // Node's TLS options expect the CA bundle under the `ca` key;
            // any other key is silently ignored and client certs can't be verified
            tls.ca = await fsp.readFile(cfg.mon_https_ca);
        }
        if (cfg.mon_https_client_auth)
        {
            tls.requestCert = true;
        }
        server = https.createServer(tls, handler);
    }
    else
    {
        server = http.createServer(handler);
    }
    try
    {
        // listen() reports bind failures (EADDRINUSE, EACCES...) asynchronously
        // through the 'error' event, so wait for either outcome here.
        // Synchronous throws (e.g. invalid port) reject the same promise.
        await new Promise((resolve, reject) =>
        {
            server.once('error', reject);
            server.once('listening', () =>
            {
                server.removeListener('error', reject);
                resolve();
            });
            server.listen(port, cfg.mon_http_ip || undefined);
        });
    }
    catch (e)
    {
        console.error(
            'HTTP server disabled because listen at address: '+
            (cfg.mon_http_ip || '')+':'+port+' failed with error: '+e
        );
        return null;
    }
    return server;
}
module.exports = { create_http_server };

View File

@ -1,10 +1,13 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { URL } = require('url');
const fs = require('fs');
const crypto = require('crypto');
const os = require('os');
const EtcdAdapter = require('./etcd_adapter.js');
const { create_http_server } = require('./http_server.js');
const { export_prometheus_metrics } = require('./prometheus.js');
const { etcd_tree, etcd_allow, etcd_nonempty_keys } = require('./etcd_schema.js');
const { validate_pool_cfg } = require('./pool_config.js');
const { sum_op_stats, sum_object_counts, sum_inode_stats, serialize_bigints } = require('./stats.js');
@ -60,6 +63,32 @@ class Mon
this.recheck_pgs_active = false;
this.etcd = new EtcdAdapter(this);
this.etcd.parse_config(this.config);
this.watcher_active = false;
// Start the built-in HTTP server with the Prometheus metrics endpoint.
// The exporter is enabled unless enable_prometheus is explicitly set to a falsy value.
if (this.config.enable_prometheus || !('enable_prometheus' in this.config))
{
    // create_http_server() is an async function, so this.http holds a Promise
    this.http = create_http_server(this.config, (req, res) =>
    {
        const metrics_path = this.config.prometheus_path || '/metrics';
        const u = new URL(req.url, 'http://'+(req.headers.host || 'localhost'));
        // Trailing slashes are ignored when matching the metrics path
        if (u.pathname.replace(/\/+$/, '') == metrics_path)
        {
            if (!this.watcher_active)
            {
                // watcher_active only becomes true in start() after the etcd watcher is started
                res.writeHead(503);
                res.write('Monitor is in standby mode. Please retrieve metrics from master monitor instance\n');
            }
            else
            {
                // Declare the Prometheus text exposition format explicitly,
                // strict scrapers refuse responses without a Content-Type
                res.writeHead(200, { 'Content-Type': 'text/plain; version=0.0.4; charset=utf-8' });
                res.write(export_prometheus_metrics(this.state));
            }
        }
        else
        {
            res.writeHead(404);
            // The newline must follow the path regardless of whether it is the default or a custom one
            res.write('Not found. Metrics path: '+metrics_path+'\n');
        }
        res.end();
    });
}
}
async start()
@ -69,6 +98,7 @@ class Mon
await this.etcd.become_master();
await this.load_cluster_state();
await this.etcd.start_watcher(this.config.etcd_mon_retries);
this.watcher_active = true;
for (const pool_id in this.state.config.pools)
{
if (!this.state.pool.stats[pool_id] ||

220
mon/prometheus.js Normal file
View File

@ -0,0 +1,220 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
// Static header of the Prometheus text exposition output.
// Every metric family must have exactly one HELP and one TYPE line:
// a repeated HELP line for the same metric makes the payload invalid.
const metric_help =
`# HELP vitastor_object_bytes Total size of objects in cluster in bytes
# TYPE vitastor_object_bytes gauge
# HELP vitastor_object_count Total number of objects in cluster
# TYPE vitastor_object_count gauge
# HELP vitastor_stat_count Total operation count
# TYPE vitastor_stat_count counter
# HELP vitastor_stat_usec Total operation latency in usec
# TYPE vitastor_stat_usec counter
# HELP vitastor_stat_bytes Total operation size in bytes
# TYPE vitastor_stat_bytes counter
# HELP vitastor_image_raw_used Image raw used size in bytes
# TYPE vitastor_image_raw_used gauge
# HELP vitastor_image_stat_count Per-image total operation count
# TYPE vitastor_image_stat_count counter
# HELP vitastor_image_stat_usec Per-image total operation latency
# TYPE vitastor_image_stat_usec counter
# HELP vitastor_image_stat_bytes Per-image total operation size in bytes
# TYPE vitastor_image_stat_bytes counter
# HELP vitastor_osd_status OSD up/down status
# TYPE vitastor_osd_status gauge
# HELP vitastor_osd_size_bytes OSD total space in bytes
# TYPE vitastor_osd_size_bytes gauge
# HELP vitastor_osd_free_bytes OSD free space in bytes
# TYPE vitastor_osd_free_bytes gauge
# HELP vitastor_osd_stat_count Per-OSD total operation count
# TYPE vitastor_osd_stat_count counter
# HELP vitastor_osd_stat_usec Per-OSD total operation latency
# TYPE vitastor_osd_stat_usec counter
# HELP vitastor_osd_stat_bytes Per-OSD total operation size in bytes
# TYPE vitastor_osd_stat_bytes counter
# HELP vitastor_monitor_info Monitor info, 1 is master, 0 is standby
# TYPE vitastor_monitor_info gauge
# HELP vitastor_pool_info Pool configuration (in labels)
# TYPE vitastor_pool_info gauge
# HELP vitastor_pool_status Pool up/down status
# TYPE vitastor_pool_status gauge
# HELP vitastor_pool_raw_to_usable Raw to usable space ratio
# TYPE vitastor_pool_raw_to_usable gauge
# HELP vitastor_pool_space_efficiency Pool space usage efficiency
# TYPE vitastor_pool_space_efficiency gauge
# HELP vitastor_pool_total_raw_tb Total raw space in pool in TB
# TYPE vitastor_pool_total_raw_tb gauge
# HELP vitastor_pool_used_raw_tb Used raw space in pool in TB
# TYPE vitastor_pool_used_raw_tb gauge
# HELP vitastor_pg_count PG counts by state
# TYPE vitastor_pg_count gauge
`;
// Render monitor state `st` as Prometheus text: the static HELP/TYPE header
// (metric_help) followed by one sample line per metric value.
// Reads st.stats, st.inode.stats, st.config.inode, st.osd.stats, st.osd.state,
// st.mon, st.config.pools, st.config.pgs, st.pool.stats and st.pg.state.
function export_prometheus_metrics(st)
{
    const stat_keys = [ 'count', 'usec', 'bytes' ];
    const chunks = [];
    const emit = (line) =>
    {
        chunks.push(line+'\n');
    };

    // Global statistics
    for (const obj_type in st.stats.object_bytes)
    {
        emit(`vitastor_object_bytes{object_type="${obj_type}"} ${st.stats.object_bytes[obj_type]}`);
    }
    for (const obj_type in st.stats.object_counts)
    {
        emit(`vitastor_object_count{object_type="${obj_type}"} ${st.stats.object_counts[obj_type]}`);
    }
    for (const op_type of [ 'op', 'subop', 'recovery' ])
    {
        const by_op = st.stats[op_type+'_stats'] || {};
        for (const op in by_op)
        {
            const values = by_op[op];
            for (const key of stat_keys)
            {
                emit(`vitastor_stat_${key}{op="${op}",op_type="${op_type}"} ${values[key]||0}`);
            }
        }
    }

    // Per-image statistics
    for (const pool in st.inode.stats)
    {
        const pool_inodes = st.inode.stats[pool];
        for (const inode in pool_inodes)
        {
            const inode_stat = pool_inodes[inode];
            const inode_cfg = (st.config.inode[pool] || {})[inode] || {};
            const image_name = inode_cfg.name || '';
            const inode_label = `image_name="${addslashes(image_name)}",inode_num="${inode}",pool_id="${pool}"`;
            emit(`vitastor_image_raw_used{${inode_label}} ${inode_stat.raw_used||0}`);
            for (const op of [ 'read', 'write', 'delete' ])
            {
                const values = inode_stat[op];
                if (!values)
                {
                    // Operations without recorded statistics produce no samples
                    continue;
                }
                for (const key of stat_keys)
                {
                    emit(`vitastor_image_stat_${key}{${inode_label},op="${op}"} ${values[key]||0}`);
                }
            }
        }
    }

    // Per-OSD statistics
    for (const osd in st.osd.stats)
    {
        const osd_stat = st.osd.stats[osd];
        const osd_state = st.osd.state[osd];
        const is_up = (osd_state && osd_state.state == 'up') ? 1 : 0;
        emit(`vitastor_osd_status{host="${addslashes(osd_stat.host)}",osd_num="${osd}"} ${is_up}`);
        emit(`vitastor_osd_size_bytes{osd_num="${osd}"} ${osd_stat.size||0}`);
        emit(`vitastor_osd_free_bytes{osd_num="${osd}"} ${osd_stat.free||0}`);
        // Primary operations first, then sub-operations
        const groups = [ [ 'op', osd_stat.op_stats ], [ 'subop', osd_stat.subop_stats ] ];
        for (const [ op_type, by_op ] of groups)
        {
            for (const op in by_op)
            {
                const values = by_op[op];
                for (const key of stat_keys)
                {
                    emit(`vitastor_osd_stat_${key}{osd_num="${osd}",op="${op}",op_type="${op_type}"} ${values[key]||0}`);
                }
            }
        }
    }

    // Monitor statistics
    for (const mon_id in st.mon.member)
    {
        const member = st.mon.member[mon_id];
        // Loose comparison is intentional: mon_id is an object key (string)
        const is_master = (st.mon.master && st.mon.master.id == mon_id) ? 1 : 0;
        const first_ip = (member.ip instanceof Array ? member.ip[0] : member.ip) || '';
        emit(
            `vitastor_monitor_info{monitor_hostname="${addslashes(member.hostname)}"`+
            `,monitor_id="${mon_id}",monitor_ip="${addslashes(first_ip)}"} ${is_master}`
        );
    }

    // Per-pool statistics
    const pg_state_names = [
        'active', 'starting', 'peering', 'incomplete', 'repeering', 'stopping', 'offline',
        'degraded', 'has_inconsistent', 'has_corrupted', 'has_incomplete', 'has_degraded',
        'has_misplaced', 'has_unclean', 'has_invalid', 'left_on_dead', 'scrubbing',
    ];
    for (const pool_id in st.config.pools)
    {
        const pool_cfg = st.config.pools[pool_id];
        const pool_label = `pool_id="${pool_id}",pool_name="${addslashes(pool_cfg.name)}"`;
        const pool_stat = st.pool.stats[pool_id];
        emit(
            `vitastor_pool_info{${pool_label}`+
            `,pool_scheme="${addslashes(pool_cfg.scheme)}"`+
            `,pg_size="${pool_cfg.pg_size||0}",pg_minsize="${pool_cfg.pg_minsize||0}"`+
            `,parity_chunks="${pool_cfg.parity_chunks||0}",pg_count="${pool_cfg.pg_count||0}"`+
            `,failure_domain="${addslashes(pool_cfg.failure_domain)}"`+
            `} 1`
        );
        if (!pool_stat)
        {
            // Only the info sample is emitted for pools without statistics
            continue;
        }
        emit(`vitastor_pool_raw_to_usable{${pool_label}} ${pool_stat.raw_to_usable||0}`);
        emit(`vitastor_pool_space_efficiency{${pool_label}} ${pool_stat.space_efficiency||0}`);
        emit(`vitastor_pool_total_raw_tb{${pool_label}} ${pool_stat.total_raw_tb||0}`);
        emit(`vitastor_pool_used_raw_tb{${pool_label}} ${pool_stat.used_raw_tb||0}`);

        // PG states and pool up/down status
        const pg_items = ((st.config.pgs || {}).items || {})[pool_id] || {};
        // Fall back to the configured PG count when no PG items exist for the pool
        const real_pg_count = Object.keys(pg_items).length || (0|pool_cfg.pg_count);
        const per_state = {};
        for (const name of pg_state_names)
        {
            per_state[name] = 0;
        }
        const pool_pg_states = st.pg.state[pool_id] || {};
        for (let pg_num = 1; pg_num <= real_pg_count; pg_num++)
        {
            const pg = pool_pg_states[pg_num];
            // A PG without a reported state is counted as offline
            const names = pg ? pg.state : [ 'offline' ];
            for (const name of names)
            {
                per_state[name] = (per_state[name]|0) + 1;
            }
        }
        for (const name in per_state)
        {
            emit(`vitastor_pg_count{pg_state="${name}",${pool_label}} ${per_state[name]}`);
        }
        // Pool is reported up only when every PG is active
        const pool_active = per_state['active'] >= real_pg_count ? 1 : 0;
        emit(`vitastor_pool_status{${pool_label}} ${pool_active}`);
    }

    return metric_help + chunks.join('');
}
// Escape a value for use inside a double-quoted Prometheus label:
// `"` becomes `\"`, `\` becomes `\\` and a line feed becomes the two
// characters `\n` (a raw newline would break the line-based format).
// Falsy input (undefined, null, '', 0) yields an empty string.
function addslashes(str)
{
    return ((str||'')+'').replace(/["\n\\]/g, (ch) => (ch === '\n' ? '\\n' : '\\'+ch));
}
module.exports = { export_prometheus_metrics };

File diff suppressed because it is too large Load Diff

View File

@ -3,10 +3,10 @@
function derive_osd_stats(st, prev, prev_diff)
{
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
const diff = prev_diff || { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
return prev_diff || diff;
return diff;
}
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
@ -74,7 +74,7 @@ function sum_op_stats(all_osd, prev_stats)
);
prev_stats.osd_stats[osd] = cur;
}
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: { degraded: {}, misplaced: {} } };
// Sum derived values instead of deriving summed
for (const osd in all_osd.state)
{