Compare commits
31 Commits
antietcd
...
msgr-iothr
Author | SHA1 | Date | |
---|---|---|---|
249a233b37 | |||
d07e072212 | |||
21d1171ba4 | |||
![]() |
8f83086889 | ||
ceb18f25db | |||
ed51a89f70 | |||
f59456f22d | |||
ca63cd507d | |||
ea0d72289c | |||
e400a851f4 | |||
0fec7a9fea | |||
b9de2a92a9 | |||
5360a70853 | |||
4c2328eb13 | |||
313daef12d | |||
ad9c12e1b9 | |||
4473eb5512 | |||
6501abc060 | |||
1228403e74 | |||
4eabebd245 | |||
cf60b6818c | |||
1a4a7cdc37 | |||
1b48085e21 | |||
a71847244e | |||
848c2d2722 | |||
86832dc43f | |||
1f6da79463 | |||
9bf57c3760 | |||
a0305b5b4a | |||
1546f8e447 | |||
8ce962b312 |
5
debian/vitastor-mon.install
vendored
5
debian/vitastor-mon.install
vendored
@@ -1,2 +1,3 @@
|
||||
mon usr/lib/vitastor
|
||||
mon/vitastor-mon.service /lib/systemd/system
|
||||
mon usr/lib/vitastor/mon
|
||||
mon/scripts/make-etcd usr/lib/vitastor/mon
|
||||
mon/scripts/vitastor-mon.service /lib/systemd/system
|
||||
|
6
debian/vitastor-osd.install
vendored
6
debian/vitastor-osd.install
vendored
@@ -1,6 +1,6 @@
|
||||
usr/bin/vitastor-osd
|
||||
usr/bin/vitastor-disk
|
||||
usr/bin/vitastor-dump-journal
|
||||
mon/vitastor-osd@.service /lib/systemd/system
|
||||
mon/vitastor.target /lib/systemd/system
|
||||
mon/90-vitastor.rules /lib/udev/rules.d
|
||||
mon/scripts/vitastor-osd@.service /lib/systemd/system
|
||||
mon/scripts/vitastor.target /lib/systemd/system
|
||||
mon/scripts/90-vitastor.rules /lib/udev/rules.d
|
||||
|
@@ -248,7 +248,7 @@ etcd_report_interval to guarantee that keepalive actually works.
|
||||
## etcd_ws_keepalive_interval
|
||||
|
||||
- Type: seconds
|
||||
- Default: 30
|
||||
- Default: 5
|
||||
- Can be changed online: yes
|
||||
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
|
@@ -259,7 +259,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||
## etcd_ws_keepalive_interval
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 30
|
||||
- Значение по умолчанию: 5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал проверки живости вебсокет-подключений к etcd.
|
||||
|
@@ -282,7 +282,7 @@
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
- name: etcd_ws_keepalive_interval
|
||||
type: sec
|
||||
default: 30
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
|
@@ -22,7 +22,7 @@
|
||||
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
|
||||
[here](../config/layout-cluster.en.md#immediate_commit).
|
||||
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
|
||||
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
|
||||
Toshiba MG, Seagate EXOS or something similar. If your drives don't have such cache then
|
||||
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
|
||||
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
|
||||
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
|
||||
@@ -33,7 +33,7 @@
|
||||
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
|
||||
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
|
||||
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
|
||||
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
|
||||
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
|
||||
|
||||
## Configure monitors
|
||||
|
||||
|
@@ -123,4 +123,4 @@ vitastor-cli create -s 10G testimg
|
||||
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
|
||||
а также кластерную файловую систему, то:
|
||||
|
||||
- [Следуйте инструкциям](../usage/nfs.en.md#vitastorfs)
|
||||
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)
|
||||
|
@@ -11,6 +11,7 @@ module.exports = {
|
||||
"ecmaVersion": 2020
|
||||
},
|
||||
"plugins": [
|
||||
"import"
|
||||
],
|
||||
"rules": {
|
||||
"indent": [
|
||||
@@ -44,6 +45,10 @@ module.exports = {
|
||||
],
|
||||
"node/shebang": [
|
||||
"off"
|
||||
],
|
||||
"import/no-unresolved": [
|
||||
2,
|
||||
{ "commonjs": true }
|
||||
]
|
||||
}
|
||||
};
|
||||
|
356
mon/etcd_adapter.js
Normal file
356
mon/etcd_adapter.js
Normal file
@@ -0,0 +1,356 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const http = require('http');
|
||||
const WebSocket = require('ws');
|
||||
|
||||
const MON_STOPPED = 'Monitor instance is stopped';
|
||||
|
||||
// Manages the monitor's connection to etcd: keeps the list of etcd endpoints,
// picks the endpoint to talk to, maintains the watch websocket (with keepalive)
// and performs JSON API calls with per-endpoint retry pacing.
//
// `mon` is the owning monitor object. This class reads mon.config, mon.stopped,
// mon.etcd_watch_revision and mon.etcd_lease_id, and calls mon.local_ips(),
// mon.get_mon_state(), mon.on_message() and mon.die().
class EtcdAdapter
{
    constructor(mon)
    {
        this.mon = mon;
        // Current watch websocket (null when not connected)
        this.ws = null;
        // Set to true on every received message, reset by the keepalive timer
        this.ws_alive = false;
        this.ws_keepalive_timer = null;
    }

    // Read etcd endpoints from config.etcd_address (or config.etcd_url as a fallback)
    parse_config(config)
    {
        this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
    }

    // Split the endpoint list (array or comma-separated string) into local and
    // remote URLs. Each entry may carry an http:// or https:// prefix; entries
    // without a path get '/v3' appended. Exits the process if the list is empty.
    parse_etcd_addresses(addrs)
    {
        // A Set (not a plain object) so that host names like 'constructor'
        // cannot accidentally match inherited properties
        const local_ips = new Set(this.mon.local_ips(true).map(String));
        this.etcd_local = [];
        this.etcd_urls = [];
        this.selected_etcd_url = null;
        this.etcd_urls_to_try = [];
        if (!(addrs instanceof Array))
        {
            addrs = addrs ? (''+(addrs||'')).split(/,/) : [];
        }
        if (!addrs.length)
        {
            console.error('Vitastor etcd address(es) not specified. Please set on the command line or in the config file');
            process.exit(1);
        }
        for (let url of addrs)
        {
            let scheme = 'http';
            url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
            const slash = url.indexOf('/');
            // Host part ends at the first ':' or '/', whichever comes first
            const host_end = url.search(/[:/]/);
            const host = host_end >= 0 ? url.slice(0, host_end) : url;
            url = scheme+'://'+(slash >= 0 ? url : url+'/v3');
            if (local_ips.has(host))
                this.etcd_local.push(url);
            else
                this.etcd_urls.push(url);
        }
    }

    // Return the currently selected etcd URL, selecting a new one if needed.
    // Local endpoints are tried first, then remote ones in random order.
    pick_next_etcd()
    {
        if (this.selected_etcd_url)
            return this.selected_etcd_url;
        if (!this.etcd_urls_to_try || !this.etcd_urls_to_try.length)
        {
            this.etcd_urls_to_try = [ ...this.etcd_local ];
            const others = [ ...this.etcd_urls ];
            while (others.length)
            {
                // Pull a random remaining URL (0| truncates to an integer index)
                const url = others.splice(0|(others.length*Math.random()), 1);
                this.etcd_urls_to_try.push(url[0]);
            }
        }
        this.selected_etcd_url = this.etcd_urls_to_try.shift();
        return this.selected_etcd_url;
    }

    // Close the watch websocket, stop the keepalive timer and, if <cur_addr>
    // (default: the selected URL) is still the selected URL, deselect it
    stop_watcher(cur_addr)
    {
        cur_addr = cur_addr || this.selected_etcd_url;
        if (this.ws)
        {
            console.log('Disconnected from etcd at '+this.ws_used_url);
            this.ws.close();
            this.ws = null;
        }
        if (this.ws_keepalive_timer)
        {
            clearInterval(this.ws_keepalive_timer);
            this.ws_keepalive_timer = null;
        }
        if (this.selected_etcd_url == cur_addr)
        {
            this.selected_etcd_url = null;
        }
    }

    // Stop the watcher and start it again; failures are passed to mon.die()
    restart_watcher(cur_addr)
    {
        this.stop_watcher(cur_addr);
        // Call die() as a method so that it keeps its `this`
        this.start_watcher(this.mon.config.etcd_mon_retries).catch(e => this.mon.die(e));
    }

    // Open a websocket to <base>/watch and resolve to true once it is open,
    // or to false on a connection error or after mon.config.etcd_mon_timeout ms.
    // The socket is stored in this.ws while the attempt is in progress.
    _open_ws(base, cur_addr)
    {
        return new Promise(resolve =>
        {
            const ws = new WebSocket(base+'/watch');
            this.ws = ws;
            this.ws_used_url = cur_addr;
            let timer_id = setTimeout(() =>
            {
                timer_id = null;
                // Only touch this.ws if it is still the socket of THIS attempt
                if (this.ws === ws)
                {
                    console.log('Disconnected from etcd at '+this.ws_used_url);
                    this.ws = null;
                }
                ws.close();
                resolve(false);
            }, this.mon.config.etcd_mon_timeout);
            const stop_timer = () =>
            {
                if (timer_id)
                {
                    clearTimeout(timer_id);
                    timer_id = null;
                }
            };
            const fail = () =>
            {
                // Without this the stale timer would later close a newer connection
                stop_timer();
                resolve(false);
            };
            ws.on('error', fail);
            ws.on('open', () =>
            {
                ws.removeListener('error', fail);
                stop_timer();
                resolve(true);
            });
        });
    }

    // Connect the watch websocket (trying up to <retries> endpoints, at least 1),
    // subscribe to all keys under mon.config.etcd_prefix starting from
    // mon.etcd_watch_revision and forward events to mon.on_message().
    // Calls mon.die() if no connection could be opened.
    async start_watcher(retries)
    {
        let retry = 0;
        // Missing, zero and negative values all mean a single attempt here
        if (!retries || retries < 1)
        {
            retries = 1;
        }
        const tried = {};
        while (retry < retries)
        {
            const cur_addr = this.pick_next_etcd();
            // http://... -> ws://..., https://... -> wss://...
            const base = 'ws'+cur_addr.slice(4);
            let now = Date.now();
            // Do not retry the same endpoint more often than once per etcd_start_timeout
            if (tried[base] && now-tried[base] < this.mon.config.etcd_start_timeout)
            {
                await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout-(now-tried[base])));
                now = Date.now();
            }
            tried[base] = now;
            if (this.mon.stopped)
            {
                return;
            }
            const ok = await this._open_ws(base, cur_addr);
            if (ok)
                break;
            if (this.selected_etcd_url == cur_addr)
                this.selected_etcd_url = null;
            this.ws = null;
            retry++;
        }
        if (!this.ws)
        {
            this.mon.die('Failed to open etcd watch websocket');
            return;
        }
        if (this.mon.stopped)
        {
            this.stop_watcher();
            return;
        }
        const cur_addr = this.selected_etcd_url;
        this.ws_alive = true;
        // Keepalive: every interval send a progress request; if nothing at all
        // was received since the previous tick, consider the socket dead
        this.ws_keepalive_timer = setInterval(() =>
        {
            if (this.ws_alive && this.ws)
            {
                this.ws_alive = false;
                this.ws.send(JSON.stringify({ progress_request: {} }));
            }
            else
            {
                console.log('etcd websocket timed out, restarting it');
                this.restart_watcher(cur_addr);
            }
        }, (Number(this.mon.config.etcd_ws_keepalive_interval) || 5)*1000);
        this.ws.on('error', () => this.restart_watcher(cur_addr));
        this.ws.send(JSON.stringify({
            create_request: {
                // [prefix+'/', prefix+'0') covers every key under prefix+'/'
                key: b64(this.mon.config.etcd_prefix+'/'),
                range_end: b64(this.mon.config.etcd_prefix+'0'),
                start_revision: ''+this.mon.etcd_watch_revision,
                watch_id: 1,
                progress_notify: true,
            },
        }));
        this.ws.on('message', (msg) =>
        {
            if (this.mon.stopped)
            {
                this.stop_watcher();
                return;
            }
            this.ws_alive = true;
            let data;
            try
            {
                data = JSON.parse(msg);
            }
            catch (e)
            {
                // Unparseable messages are reported below as unknown
            }
            if (!data || !data.result)
            {
                console.error('Unknown message received from watch websocket: '+msg);
            }
            else if (data.result.canceled)
            {
                // etcd watch canceled
                if (data.result.compact_revision)
                {
                    // we may miss events if we proceed
                    this.mon.die('Revisions before '+data.result.compact_revision+' were compacted by etcd, exiting');
                }
                else
                {
                    this.mon.die('Watch canceled by etcd, reason: '+data.result.cancel_reason+', exiting');
                }
            }
            else if (data.result.created)
            {
                // etcd watch created
                console.log('Successfully subscribed to etcd at '+this.selected_etcd_url+', revision '+data.result.header.revision);
            }
            else
            {
                this.mon.on_message(data.result);
            }
        });
    }

    // Loop until this monitor manages to create <prefix>/mon/master
    // (bound to its lease) in a transaction that requires the key to be absent
    async become_master()
    {
        const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
        // eslint-disable-next-line no-constant-condition
        while (1)
        {
            const res = await this.etcd_call('/kv/txn', {
                compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.mon.config.etcd_prefix+'/mon/master') } ],
                success: [ { requestPut: { key: b64(this.mon.config.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } } ],
            }, this.mon.config.etcd_start_timeout, 0);
            if (res.succeeded)
            {
                break;
            }
            console.log('Waiting to become master');
            await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout));
        }
        console.log('Became master');
    }

    // POST <body> as JSON to <path> on the selected etcd endpoint and return
    // the parsed JSON reply. Transport errors deselect the endpoint and are
    // retried up to <retries> times (0 means 1 attempt, negative means no limit).
    // Throws if the monitor is stopped, if etcd returns an error in the reply
    // body, or when retries are exhausted.
    async etcd_call(path, body, timeout, retries)
    {
        let retry = 0;
        if (retries >= 0 && retries < 1)
        {
            retries = 1;
        }
        const tried = {};
        while (retries < 0 || retry < retries)
        {
            retry++;
            const base = this.pick_next_etcd();
            let now = Date.now();
            // Do not hit the same endpoint more often than once per <timeout>
            if (tried[base] && now-tried[base] < timeout)
            {
                await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
                now = Date.now();
            }
            tried[base] = now;
            if (this.mon.stopped)
            {
                throw new Error(MON_STOPPED);
            }
            const res = await POST(base+path, body, timeout);
            if (this.mon.stopped)
            {
                throw new Error(MON_STOPPED);
            }
            if (res.error)
            {
                if (this.selected_etcd_url == base)
                    this.selected_etcd_url = null;
                console.error('Failed to query etcd '+path+' (retry '+retry+'/'+retries+'): '+res.error);
                continue;
            }
            if (res.json)
            {
                if (res.json.error)
                {
                    // Application-level error: retrying will not help
                    console.error(path+': etcd returned error: '+res.json.error);
                    break;
                }
                return res.json;
            }
        }
        throw new Error('Failed to query etcd ('+retries+' retries)');
    }
}
|
||||
|
||||
// Send <body> as a JSON POST request to <url>.
// Never rejects: resolves to { json, response } on HTTP 200 with a valid JSON
// body, or to { error, ... } on timeout ('timeout'), transport error,
// non-200 status (with `code`) or unparseable body (with `response` and `body`).
// <timeout> is in milliseconds and is only armed when > 0; it is cancelled
// as soon as the response starts arriving.
function POST(url, body, timeout)
{
    return new Promise(ok =>
    {
        const body_text = Buffer.from(JSON.stringify(body));
        let settled = false;
        let timer_id = null;
        // Resolve once and always release the timeout timer
        const finish = (result) =>
        {
            if (timer_id)
            {
                clearTimeout(timer_id);
                timer_id = null;
            }
            if (!settled)
            {
                settled = true;
                ok(result);
            }
        };
        const req = http.request(url, { method: 'POST', headers: {
            'Content-Type': 'application/json',
            'Content-Length': body_text.length,
        } }, (res) =>
        {
            if (settled)
            {
                // Already timed out or failed
                return;
            }
            if (timer_id)
            {
                clearTimeout(timer_id);
                timer_id = null;
            }
            let res_body = '';
            res.setEncoding('utf8');
            res.on('error', (error) => finish({ error }));
            res.on('data', chunk => { res_body += chunk; });
            res.on('end', () =>
            {
                if (res.statusCode != 200)
                {
                    finish({ error: res_body, code: res.statusCode });
                    return;
                }
                try
                {
                    res_body = JSON.parse(res_body);
                    finish({ response: res, json: res_body });
                }
                catch (e)
                {
                    finish({ error: e, response: res, body: res_body });
                }
            });
        });
        if (timeout > 0)
        {
            timer_id = setTimeout(() =>
            {
                timer_id = null;
                // Settle first so that the error emitted by destroy() is ignored
                finish({ error: 'timeout' });
                req.destroy();
            }, timeout);
        }
        req.on('error', (error) => finish({ error }));
        // 'close' after a normal reply is a no-op because the promise is already settled
        req.on('close', () => finish({ error: new Error('Connection closed prematurely') }));
        req.write(body_text);
        req.end();
    });
}
|
||||
|
||||
// Encode <str> as base64 (etcd JSON API expects base64-encoded keys and values)
function b64(str)
{
    const raw = Buffer.from(str);
    return raw.toString('base64');
}
|
||||
|
||||
module.exports = EtcdAdapter;
|
391
mon/etcd_schema.js
Normal file
391
mon/etcd_schema.js
Normal file
@@ -0,0 +1,391 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
|
||||
// Keys (relative to the etcd prefix) flagged with 1 in this lookup table.
// NOTE(review): presumably these are the keys that must hold a non-empty
// value - confirm against the code that reads this table.
const etcd_nonempty_keys = {
    'config/global': 1,
    'config/node_placement': 1,
    'config/pools': 1,
    'config/pgs': 1,
    'history/last_clean_pgs': 1,
    'stats': 1,
};
|
||||
// Whitelist of key paths (relative to the etcd prefix). A key matches only
// if the WHOLE key equals one of the patterns below.
const etcd_allow = new RegExp('^(?:'+[
    'config/global',
    'config/node_placement',
    'config/pools',
    'config/osd/[1-9]\\d*',
    'config/pgs',
    'config/inode/[1-9]\\d*/[1-9]\\d*',
    'osd/state/[1-9]\\d*',
    'osd/stats/[1-9]\\d*',
    'osd/inodestats/[1-9]\\d*',
    'osd/space/[1-9]\\d*',
    'mon/master',
    'mon/member/[a-f0-9]+',
    'pg/state/[1-9]\\d*/[1-9]\\d*',
    'pg/stats/[1-9]\\d*/[1-9]\\d*',
    'pg/history/[1-9]\\d*/[1-9]\\d*',
    'history/last_clean_pgs',
    'inode/stats/[1-9]\\d*/\\d+',
    'pool/stats/[1-9]\\d*',
    'stats',
    'index/image/.*',
    'index/maxid/[1-9]\\d*',
].join('|')+')$');
|
||||
|
||||
// Skeleton of the state tree mirrored from etcd: every node is an empty
// object; the block comments next to each node describe the expected contents.
// NOTE(review): etcd_allow permits 'mon/member/<hex>' keys but there is no
// `mon.member` node here (only `master` and `standby`) - confirm whether
// that is intended.
const etcd_tree = {
    config: {
        /* global: {
            // WARNING: NOT ALL OF THESE ARE ACTUALLY CONFIGURABLE HERE
            // THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
            // etcd connection
            config_path: "/etc/vitastor/vitastor.conf",
            etcd_prefix: "/vitastor",
            // etcd connection - configurable online
            etcd_address: "10.0.115.10:2379/v3",
            // mon
            etcd_mon_ttl: 5, // min: 1
            etcd_mon_timeout: 1000, // ms. min: 0
            etcd_mon_retries: 5, // min: 0
            mon_change_timeout: 1000, // ms. min: 100
            mon_retry_change_timeout: 50, // ms. min: 10
            mon_stats_timeout: 1000, // ms. min: 100
            osd_out_time: 600, // seconds. min: 0
            placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
            use_old_pg_combinator: false,
            // client and osd
            tcp_header_buffer_size: 65536,
            use_sync_send_recv: false,
            use_rdma: true,
            rdma_device: null, // for example, "rocep5s0f0"
            rdma_port_num: 1,
            rdma_gid_index: 0,
            rdma_mtu: 4096,
            rdma_max_sge: 128,
            rdma_max_send: 8,
            rdma_max_recv: 16,
            rdma_max_msg: 132096,
            block_size: 131072,
            disk_alignment: 4096,
            bitmap_granularity: 4096,
            immediate_commit: false, // 'all' or 'small'
            // client - configurable online
            client_max_dirty_bytes: 33554432,
            client_max_dirty_ops: 1024,
            client_enable_writeback: false,
            client_max_buffered_bytes: 33554432,
            client_max_buffered_ops: 1024,
            client_max_writeback_iodepth: 256,
            client_retry_interval: 50, // ms. min: 10
            client_eio_retry_interval: 1000, // ms
            client_retry_enospc: true,
            osd_nearfull_ratio: 0.95,
            // client and osd - configurable online
            log_level: 0,
            peer_connect_interval: 5, // seconds. min: 1
            peer_connect_timeout: 5, // seconds. min: 1
            osd_idle_timeout: 5, // seconds. min: 1
            osd_ping_timeout: 5, // seconds. min: 1
            max_etcd_attempts: 5,
            etcd_quick_timeout: 1000, // ms
            etcd_slow_timeout: 5000, // ms
            etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
            etcd_ws_keepalive_interval: 5, // seconds
            // osd
            etcd_report_interval: 5, // seconds
            etcd_stats_interval: 30, // seconds
            run_primary: true,
            osd_network: null, // "192.168.7.0/24" or an array of masks
            bind_address: "0.0.0.0",
            bind_port: 0,
            readonly: false,
            osd_memlock: false,
            // osd - configurable online
            autosync_interval: 5,
            autosync_writes: 128,
            client_queue_depth: 128, // unused
            recovery_queue_depth: 1,
            recovery_sleep_us: 0,
            recovery_tune_util_low: 0.1,
            recovery_tune_client_util_low: 0,
            recovery_tune_util_high: 1.0,
            recovery_tune_client_util_high: 0.5,
            recovery_tune_interval: 1,
            recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
            recovery_tune_sleep_min_us: 10, // 10 microseconds
            recovery_pg_switch: 128,
            recovery_sync_batch: 16,
            no_recovery: false,
            no_rebalance: false,
            print_stats_interval: 3,
            slow_log_interval: 10,
            inode_vanish_time: 60,
            auto_scrub: false,
            no_scrub: false,
            scrub_interval: '30d', // 1s/1m/1h/1d
            scrub_queue_depth: 1,
            scrub_sleep: 0, // milliseconds
            scrub_list_limit: 1000, // objects to list on one scrub iteration
            scrub_find_best: true,
            scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
            // blockstore - fixed in superblock
            block_size,
            disk_alignment,
            journal_block_size,
            meta_block_size,
            bitmap_granularity,
            journal_device,
            journal_offset,
            journal_size,
            disable_journal_fsync,
            data_device,
            data_offset,
            data_size,
            disable_data_fsync,
            meta_device,
            meta_offset,
            disable_meta_fsync,
            disable_device_lock,
            // blockstore - configurable offline
            inmemory_metadata,
            inmemory_journal,
            journal_sector_buffer_count,
            journal_no_same_sector_overwrites,
            // blockstore - configurable online
            max_write_iodepth,
            min_flusher_count: 1,
            max_flusher_count: 256,
            throttle_small_writes: false,
            throttle_target_iops: 100,
            throttle_target_mbs: 100,
            throttle_target_parallelism: 1,
            throttle_threshold_us: 50,
        }, */
        global: {},
        /* node_placement: {
            host1: { level: 'host', parent: 'rack1' },
            ...
        }, */
        node_placement: {},
        /* pools: {
            <id>: {
                name: 'testpool',
                // 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
                scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
                pg_size: 3,
                pg_minsize: 2,
                // number of parity chunks, required for EC
                parity_chunks?: 1,
                pg_count: 100,
                // default is failure_domain=host
                failure_domain?: 'host',
                // additional failure domain rules; failure_domain=x is equivalent to x=123..N
                level_placement?: 'dc=112233 host=123456',
                raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
                old_combinator: false,
                max_osd_combinations: 10000,
                // block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
                block_size: 131072,
                bitmap_granularity: 4096,
                // 'all'/'small'/'none', same as in OSD options
                immediate_commit: 'none',
                pg_stripe_size: 0,
                root_node?: 'rack1',
                // restrict pool to OSDs having all of these tags
                osd_tags?: 'nvme' | [ 'nvme', ... ],
                // prefer to put primary on OSD with these tags
                primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
                // scrub interval
                scrub_interval?: '30d',
            },
            ...
        }, */
        pools: {},
        osd: {
            /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
        },
        /* pgs: {
            hash: string,
            items: {
                <pool_id>: {
                    <pg_id>: {
                        osd_set: [ 1, 2, 3 ],
                        primary: 1,
                        pause: false,
                    }
                }
            }
        }, */
        pgs: {},
        /* inode: {
            <pool_id>: {
                <inode_t>: {
                    name: string,
                    size?: uint64_t, // bytes
                    parent_pool?: <pool_id>,
                    parent_id?: <inode_t>,
                    readonly?: boolean,
                }
            }
        }, */
        inode: {},
    },
    osd: {
        state: {
            /* <osd_num_t>: {
                state: "up",
                addresses: string[],
                host: string,
                port: uint16_t,
                primary_enabled: boolean,
                blockstore_enabled: boolean,
            }, */
        },
        stats: {
            /* <osd_num_t>: {
                time: number, // unix time
                blockstore_ready: boolean,
                size: uint64_t, // bytes
                free: uint64_t, // bytes
                host: string,
                op_stats: {
                    <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                },
                subop_stats: {
                    <string>: { count: uint64_t, usec: uint64_t },
                },
                recovery_stats: {
                    degraded: { count: uint64_t, bytes: uint64_t },
                    misplaced: { count: uint64_t, bytes: uint64_t },
                },
            }, */
        },
        inodestats: {
            /* <pool_id>: {
                <inode_t>: {
                    read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                    write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                    delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                },
            }, */
        },
        space: {
            /* <osd_num_t>: {
                <pool_id>: {
                    <inode_t>: uint64_t, // bytes
                },
            }, */
        },
    },
    mon: {
        master: {
            /* ip: [ string ], id: uint64_t */
        },
        standby: {
            /* <uint64_t>: { ip: [ string ] }, */
        },
    },
    pg: {
        state: {
            /* <pool_id>: {
                <pg_id>: {
                    primary: osd_num_t,
                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
                        "has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
                }
            }, */
        },
        stats: {
            /* <pool_id>: {
                <pg_id>: {
                    object_count: uint64_t,
                    clean_count: uint64_t,
                    misplaced_count: uint64_t,
                    degraded_count: uint64_t,
                    incomplete_count: uint64_t,
                    write_osd_set: osd_num_t[],
                },
            }, */
        },
        history: {
            /* <pool_id>: {
                <pg_id>: {
                    osd_sets: osd_num_t[][],
                    all_peers: osd_num_t[],
                    epoch: uint64_t,
                    next_scrub: uint64_t,
                },
            }, */
        },
    },
    inode: {
        stats: {
            /* <pool_id>: {
                <inode_t>: {
                    raw_used: uint64_t, // raw used bytes on OSDs
                    read: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
                    write: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
                    delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
                },
            }, */
        },
    },
    pool: {
        stats: {
            /* <pool_id>: {
                used_raw_tb: float, // used raw space in the pool
                total_raw_tb: float, // maximum amount of space in the pool
                raw_to_usable: float, // raw to usable ratio
                space_efficiency: float, // 0..1
            } */
        },
    },
    stats: {
        /* op_stats: {
            <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
        },
        subop_stats: {
            <string>: { count: uint64_t, usec: uint64_t, iops: uint64_t, lat: uint64_t },
        },
        recovery_stats: {
            degraded: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
            misplaced: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
        },
        object_counts: {
            object: uint64_t,
            clean: uint64_t,
            misplaced: uint64_t,
            degraded: uint64_t,
            incomplete: uint64_t,
        },
        object_bytes: {
            total: uint64_t,
            clean: uint64_t,
            misplaced: uint64_t,
            degraded: uint64_t,
            incomplete: uint64_t,
        }, */
    },
    history: {
        last_clean_pgs: {},
    },
    index: {
        image: {
            /* <name>: {
                id: uint64_t,
                pool_id: uint64_t,
            }, */
        },
        maxid: {
            /* <pool_id>: uint64_t, */
        },
    },
};
|
||||
|
||||
module.exports = {
|
||||
etcd_nonempty_keys,
|
||||
etcd_allow,
|
||||
etcd_tree,
|
||||
};
|
@@ -8,7 +8,7 @@
|
||||
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
|
||||
|
||||
const { SimpleCombinator } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const LPOptimizer = require('./lp_optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
ripper5: {
|
@@ -2,7 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const LPOptimizer = require('./lp_optimizer.js');
|
||||
|
||||
async function run()
|
||||
{
|
@@ -2,7 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const LPOptimizer = require('./lp_optimizer.js');
|
||||
|
||||
const crush_tree = [
|
||||
{ level: 1, children: [
|
@@ -2,7 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const LPOptimizer = require('./lp_optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
100: {
|
@@ -2,7 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const LPOptimizer = require('./lp_optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
100: {
|
@@ -23,4 +23,4 @@ for (let i = 2; i < process.argv.length; i++)
|
||||
}
|
||||
}
|
||||
|
||||
new Mon(options).start().catch(e => { console.error(e); process.exit(1); });
|
||||
Mon.run_forever(options);
|
||||
|
1888
mon/mon.js
1888
mon/mon.js
File diff suppressed because it is too large
Load Diff
215
mon/osd_tree.js
Normal file
215
mon/osd_tree.js
Normal file
@@ -0,0 +1,215 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
// Build the flat placement tree (node id => node) from OSD statistics and
// the configured node placement.
//
// global_config: reads placement_levels (level name => number) and osd_out_time (seconds)
// state: reads state.osd.stats, state.osd.state, state.config.osd and
//   state.config.node_placement
//
// Returns { up_osds, levels, osd_tree }:
//   up_osds  - { <osd_num>: true } for OSDs present in state.osd.state with reweight > 0
//   levels   - a copy of placement_levels with defaults host=100, osd=101
//   osd_tree - { <id>: { id, level, parent, size?, tags?, children? } }
// Neither global_config nor state is modified.
function get_osd_tree(global_config, state)
{
    // Copy so that the defaults are not written back into the caller's config
    const levels = { ...(global_config.placement_levels||{}) };
    levels.host = levels.host || 100;
    levels.osd = levels.osd || 101;
    const tree = {};
    const up_osds = {};
    // This requires monitor system time to be in sync with OSD system times (at least to some extent)
    const down_time = Date.now()/1000 - global_config.osd_out_time;
    for (const osd_num of Object.keys(state.osd.stats).sort((a, b) => a - b))
    {
        const stat = state.osd.stats[osd_num];
        const osd_cfg = state.config.osd[osd_num];
        let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
        if (reweight < 0 || isNaN(reweight))
            reweight = 1;
        // An OSD stays in the tree while it is up, was seen recently enough, or is marked noout
        if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
            osd_cfg && osd_cfg.noout))
        {
            // Numeric IDs are reserved for OSDs
            if (state.osd.state[osd_num] && reweight > 0)
            {
                // React to down OSDs immediately
                up_osds[osd_num] = true;
            }
            tree[osd_num] = tree[osd_num] || {};
            tree[osd_num].id = osd_num;
            tree[osd_num].parent = tree[osd_num].parent || stat.host;
            tree[osd_num].level = 'osd';
            tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
            if (osd_cfg && osd_cfg.tags)
            {
                tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
                    .reduce((a, c) => { a[c] = true; return a; }, {});
            }
            delete tree[osd_num].children;
            if (!tree[stat.host])
            {
                tree[stat.host] = {
                    id: stat.host,
                    level: 'host',
                    parent: null,
                    children: [],
                };
            }
        }
    }
    for (const node_id in state.config.node_placement||{})
    {
        const node_cfg = state.config.node_placement[node_id];
        if (!node_cfg)
        {
            continue;
        }
        // Numeric IDs always denote OSDs regardless of the configured level.
        // Kept in a local variable so that the stored configuration is not modified.
        const level = /^\d+$/.test(node_id) ? 'osd' : node_cfg.level;
        if (!node_id || !level || !levels[level] ||
            level === 'osd' && !tree[node_id])
        {
            // All nodes must have non-empty IDs and valid levels
            // OSDs have to actually exist
            continue;
        }
        tree[node_id] = tree[node_id] || {};
        tree[node_id].id = node_id;
        tree[node_id].level = level;
        tree[node_id].parent = node_cfg.parent;
        if (level !== 'osd')
        {
            tree[node_id].children = [];
        }
    }
    return { up_osds, levels, osd_tree: tree };
}
|
||||
|
||||
// Convert a flat placement tree (node id => { id, level, parent, size? })
// into a hierarchical one: returns a new object with shallow copies of all
// nodes, each having a `children` array of its child nodes, plus a synthetic
// root under the '' key. A node is attached to its `parent` only if the parent
// exists and has a smaller level number; otherwise it is attached to the root.
// OSDs with no (or non-positive) size are not attached to anything, and
// non-OSD nodes left without children are removed from the result.
// Neither <tree> nor global_config is modified.
function make_hier_tree(global_config, tree)
{
    // Copy so that the defaults are not written back into the caller's config
    const levels = { ...(global_config.placement_levels||{}) };
    levels.host = levels.host || 100;
    levels.osd = levels.osd || 101;
    tree = { ...tree };
    for (const node_id in tree)
    {
        tree[node_id] = { ...tree[node_id], children: [] };
    }
    tree[''] = { children: [] };
    for (const node_id in tree)
    {
        if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
        {
            continue;
        }
        const node_cfg = tree[node_id];
        const node_level = levels[node_cfg.level] || node_cfg.level;
        let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
            && tree[node_cfg.parent].level;
        parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
        // Parent's level must be less than child's; OSDs must be leaves
        const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
        tree[parent].children.push(tree[node_id]);
    }
    // Delete empty nodes; repeat because a deletion may leave the parent empty
    let deleted = 0;
    do
    {
        deleted = 0;
        for (const node_id in tree)
        {
            if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
            {
                const parent = tree[node_id].parent;
                // The named parent may be absent from the tree or already deleted
                if (parent && tree[parent])
                {
                    tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
                }
                deleted++;
                delete tree[node_id];
            }
        }
    } while (deleted > 0);
    return tree;
}
|
||||
|
||||
// Restrict the flat tree <pool_tree> (modified in place) to the subtree of <root_node>.
//
// Kept entries are: everything nested below <root_node> in make_hier_tree()'s view of
// the tree, <root_node> itself and the chain of its `parent` links in <pool_tree>.
// All other entries are deleted. Nothing is done when <root_node> is empty.
function filter_osds_by_root_node(global_config, pool_tree, root_node)
{
    if (!root_node)
    {
        return;
    }
    const hier_tree = make_hier_tree(global_config, pool_tree);
    const included = {};
    // Collect everything below the root node
    const pending = [ ...(hier_tree[root_node] || {}).children||[] ];
    for (let i = 0; i < pending.length; i++)
    {
        included[pending[i].id||''] = true;
        for (const child of pending[i].children||[])
        {
            pending.push(child);
        }
    }
    // Collect the root node and its ancestors. Parent links come straight from
    // the configuration, so stop on a repeated node instead of looping forever
    const visited = {};
    let cur = pool_tree[root_node] || {};
    while (cur && cur.id && !visited[cur.id])
    {
        visited[cur.id] = true;
        included[cur.id] = true;
        cur = pool_tree[cur.parent||''];
    }
    for (const item in pool_tree)
    {
        if (!included[item])
        {
            delete pool_tree[item];
        }
    }
}
|
||||
|
||||
// Delete from <orig_tree> (in place) every node of level 'osd' that lacks
// at least one of the requested tags. <tags> may be a single tag or an array of tags;
// nothing is done when it is empty. Nodes of other levels are never deleted.
function filter_osds_by_tags(orig_tree, tags)
{
    if (!tags)
    {
        return;
    }
    const required = tags instanceof Array ? tags : [ tags ];
    for (const node_id in orig_tree)
    {
        const node = orig_tree[node_id];
        if (node.level !== 'osd')
        {
            continue;
        }
        const has_all = required.every(tag => node.tags && node.tags[tag]);
        if (!has_all)
        {
            delete orig_tree[node_id];
        }
    }
}
|
||||
|
||||
// Delete from <orig_tree> (in place) the OSDs whose reported statistics do not fit
// the requested layout:
// - a reported bs_block_size that differs from <block_size>
// - a reported bitmap_granularity that differs from <bitmap_granularity>
// - immediate_commit 'small' on the OSD while 'all' is requested
// - immediate_commit 'none' on the OSD while anything but 'none' is requested
// OSDs without an entry in <osd_stats> are kept.
function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_granularity, immediate_commit)
{
    for (const node_id in orig_tree)
    {
        if (orig_tree[node_id].level !== 'osd')
        {
            continue;
        }
        const st = osd_stats[node_id];
        if (!st)
        {
            continue;
        }
        const wrong_block_size = st.bs_block_size && st.bs_block_size != block_size;
        const wrong_bitmap = st.bitmap_granularity && st.bitmap_granularity != bitmap_granularity;
        const weaker_commit = st.immediate_commit == 'small' && immediate_commit == 'all' ||
            st.immediate_commit == 'none' && immediate_commit != 'none';
        if (wrong_block_size || wrong_bitmap || weaker_commit)
        {
            delete orig_tree[node_id];
        }
    }
}
|
||||
|
||||
// Return the set (OSD number => true) of up OSDs preferred as primaries for a pool.
//
// Without pool_cfg.primary_affinity_tags this is <up_osds> itself (the same object).
// Otherwise it is a new object holding the up OSDs that have an entry in <osd_tree>
// and pass filter_osds_by_tags() for the affinity tags.
function get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
    if (!pool_cfg.primary_affinity_tags)
    {
        return up_osds;
    }
    const aff_osds = {};
    for (const osd_num of Object.keys(up_osds))
    {
        // An up OSD without a tree entry has no tags and can't match;
        // passing undefined to the tag filter would make it throw
        if (osd_tree[osd_num])
        {
            aff_osds[osd_num] = osd_tree[osd_num];
        }
    }
    filter_osds_by_tags(aff_osds, pool_cfg.primary_affinity_tags);
    for (const osd_num in aff_osds)
    {
        aff_osds[osd_num] = true;
    }
    return aff_osds;
}
|
||||
|
||||
module.exports = {
|
||||
get_osd_tree,
|
||||
make_hier_tree,
|
||||
filter_osds_by_root_node,
|
||||
filter_osds_by_tags,
|
||||
filter_osds_by_block_layout,
|
||||
get_affinity_osds,
|
||||
};
|
@@ -4,7 +4,7 @@
|
||||
"description": "Vitastor SDS monitor service",
|
||||
"main": "mon-main.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
"lint": "eslint *.js lp_optimizer/*.js scripts/*.js"
|
||||
},
|
||||
"author": "Vitaliy Filippov",
|
||||
"license": "UNLICENSED",
|
||||
@@ -14,12 +14,10 @@
|
||||
},
|
||||
"devDependencies": {
|
||||
"eslint": "^8.0.0",
|
||||
"eslint-plugin-import": "^2.29.1",
|
||||
"eslint-plugin-node": "^11.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"lint": "eslint *.js"
|
||||
}
|
||||
}
|
||||
|
267
mon/pg_gen.js
Normal file
267
mon/pg_gen.js
Normal file
@@ -0,0 +1,267 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
|
||||
const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
|
||||
const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
|
||||
const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
|
||||
const { scale_pg_count } = require('./pg_utils.js');
|
||||
const { make_hier_tree, filter_osds_by_root_node,
|
||||
filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
|
||||
|
||||
// Current state of the deterministic generator below
let seed;

// Rewind the generator to its fixed starting state, so that the same
// sequence of rng() calls yields the same sequence of values
function reset_rng()
{
    seed = 0x5f020e43;
}

// Advance the generator by one step (shift-and-xor by 13, 17 and 5 bits on a 32-bit
// signed value) and return the new state offset by 2^31, i.e. a non-negative number
function rng()
{
    let x = seed;
    x = x ^ (x << 13);
    x = x ^ (x >> 17);
    x = x ^ (x << 5);
    seed = x;
    return x + 2147483648;
}
|
||||
|
||||
// Choose the primary OSD for a PG with OSD set <osd_set>.
//
// Candidate groups are tried in order and the first non-empty one is used:
// - replicated pools: affinity OSDs, then up OSDs
// - other schemes: affinity OSDs among the first (pg_size - parity_chunks) positions,
//   affinity OSDs anywhere, up OSDs among those first positions, up OSDs anywhere
// The primary is then picked from that group using rng().
// Returns 0 when no OSD of the set is up.
function pick_primary(pool_config, osd_set, up_osds, aff_osds)
{
    const is_affine = osd_num => osd_num && aff_osds[osd_num];
    const is_up = osd_num => osd_num && up_osds[osd_num];
    let attempts;
    if (pool_config.scheme === 'replicated')
    {
        // Prefer "affinity" OSDs
        attempts = [
            () => osd_set.filter(osd_num => is_affine(osd_num)),
            () => osd_set.filter(osd_num => is_up(osd_num)),
        ];
    }
    else
    {
        // Prefer data OSDs for EC because they can actually read something without an additional network hop
        const pg_data_size = (pool_config.pg_size||0) - (pool_config.parity_chunks||0);
        attempts = [
            () => osd_set.slice(0, pg_data_size).filter(osd_num => is_affine(osd_num)),
            () => osd_set.filter(osd_num => is_affine(osd_num)),
            () => osd_set.slice(0, pg_data_size).filter(osd_num => is_up(osd_num)),
            () => osd_set.filter(osd_num => is_up(osd_num)),
        ];
    }
    let alive_set = [];
    for (const attempt of attempts)
    {
        alive_set = attempt();
        if (alive_set.length)
        {
            break;
        }
    }
    if (!alive_set.length)
    {
        return 0;
    }
    return alive_set[rng() % alive_set.length];
}
|
||||
|
||||
// Re-run primary OSD selection for every configured PG of every valid pool.
//
// Returns a deep copy of state.config.pgs with updated `primary` fields when at least
// one PG has to change its primary, or undefined when nothing changes.
// state.config.pgs itself is not modified. Each primary move is logged.
function recheck_primary(state, global_config, up_osds, osd_tree)
{
    let new_config_pgs;
    // config/pgs may have no items at all
    const all_pg_items = state.config.pgs.items || {};
    for (const pool_id in state.config.pools)
    {
        const pool_cfg = state.config.pools[pool_id];
        if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
        {
            continue;
        }
        const pool_pgs = all_pg_items[pool_id];
        if (!pool_pgs)
        {
            // No PGs configured for this pool yet
            continue;
        }
        const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
        // Restart the generator for each pool so that the choice is reproducible
        reset_rng();
        for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
        {
            const pg_cfg = pool_pgs[pg_num];
            if (!pg_cfg)
            {
                continue;
            }
            const new_primary = pick_primary(pool_cfg, pg_cfg.osd_set, up_osds, aff_osds);
            if (pg_cfg.primary != new_primary)
            {
                if (!new_config_pgs)
                {
                    // Copy lazily, only when the first change is found
                    new_config_pgs = JSON.parse(JSON.stringify(state.config.pgs));
                }
                console.log(
                    `Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
                    ` primary OSD from ${pg_cfg.primary} to ${new_primary}`
                );
                new_config_pgs.items[pool_id][pg_num].primary = new_primary;
            }
        }
    }
    return new_config_pgs;
}
|
||||
|
||||
// Fill an etcd transaction with the new PG configuration of one pool.
//
// - <save_to>.items[pool_id] receives { [pg_num]: { osd_set, primary } } built from
//   <new_pgs> (or is deleted when <new_pgs> is empty); LPOptimizer.NO_OSD becomes 0
// - <pg_history> (indexed from 0) gets the previous OSD set appended for each PG whose
//   set changed and was not empty; stored sets are de-duplicated
// - <request>.compare gets a MOD revision check and <request>.success gets a put or
//   a delete of the history key for every PG index of either <new_pgs> or <prev_pgs>
function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
{
    const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
    const pg_items = {};
    reset_rng();
    new_pgs.forEach((raw_set, idx) =>
    {
        const osd_set = raw_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
        const primary = pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds);
        pg_items[idx+1] = { osd_set, primary };
        const old_set = prev_pgs[idx];
        if (old_set && old_set.join(' ') != osd_set.join(' ') &&
            old_set.some(osd_num => osd_num))
        {
            // Remember where the data of this PG used to live
            const hist = pg_history[idx] = pg_history[idx] || {};
            hist.osd_sets = hist.osd_sets || [];
            hist.osd_sets.push(old_set);
        }
        if (pg_history[idx] && pg_history[idx].osd_sets)
        {
            // Keep one copy of each distinct OSD set
            const unique_sets = pg_history[idx].osd_sets
                .reduce((a, c) => { a[c.join(' ')] = c; return a; }, {});
            pg_history[idx].osd_sets = Object.values(unique_sets);
        }
    });
    for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
    {
        // FIXME: etcd has max_txn_ops limit, and it's 128 by default
        // Sooo we probably want to change our storage scheme for PG histories...
        const history_key = b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1));
        request.compare.push({
            key: history_key,
            target: 'MOD',
            mod_revision: ''+etcd_watch_revision,
            result: 'LESS',
        });
        if (pg_history[i])
        {
            request.success.push({
                requestPut: {
                    key: history_key,
                    value: b64(JSON.stringify(pg_history[i])),
                },
            });
        }
        else
        {
            request.success.push({
                requestDeleteRange: {
                    key: history_key,
                },
            });
        }
    }
    save_to.items = save_to.items || {};
    if (new_pgs.length)
    {
        save_to.items[pool_id] = pg_items;
    }
    else
    {
        delete save_to.items[pool_id];
    }
}
|
||||
|
||||
// Compute a new PG distribution for pool <pool_id>.
//
// The OSD tree is narrowed down to the pool's root node, OSD tags and block layout,
// previous PG sets are taken from last_clean_pgs (or from config/pgs when that is
// empty) and LPOptimizer is asked either to adjust them or to build an initial layout.
//
// Returns null for an invalid pool configuration, otherwise
// { pool_id, pgs, stats: { total_raw_tb, pg_real_size, raw_to_usable, space_efficiency } }.
async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels)
{
    const pool_cfg = state.config.pools[pool_id];
    if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
    {
        return null;
    }
    // Work on a copy of the flat tree: the filters below delete entries in place
    const flat_tree = { ...osd_tree };
    filter_osds_by_root_node(global_config, flat_tree, pool_cfg.root_node);
    filter_osds_by_tags(flat_tree, pool_cfg.osd_tags);
    const want_block_size = pool_cfg.block_size || global_config.block_size || 131072;
    const want_bitmap = pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096;
    const want_commit = pool_cfg.immediate_commit || global_config.immediate_commit || 'none';
    filter_osds_by_block_layout(flat_tree, state.osd.stats, want_block_size, want_bitmap, want_commit);
    const pool_tree = make_hier_tree(global_config, flat_tree);
    // First try last_clean_pgs to minimize data movement
    const prev_pgs = [];
    const clean_items = (state.history.last_clean_pgs.items||{})[pool_id]||{};
    for (const pg in clean_items)
    {
        prev_pgs[pg-1] = [ ...clean_items[pg].osd_set ];
    }
    if (!prev_pgs.length)
    {
        // Fall back to config/pgs if it's empty
        const config_items = (state.config.pgs.items||{})[pool_id]||{};
        for (const pg in config_items)
        {
            prev_pgs[pg-1] = [ ...config_items[pg].osd_set ];
        }
    }
    const old_pg_count = prev_pgs.length;
    const osd_weights = Object.values(pool_tree)
        .filter(item => item.level === 'osd')
        .reduce((a, c) => { a[c.id] = c.size; return a; }, {});
    const use_rules = !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement;
    let combinator;
    if (use_rules)
    {
        // new algorithm:
        combinator = new RuleCombinator(pool_tree, get_pg_rules(pool_id, pool_cfg, global_config.placement_levels), pool_cfg.max_osd_combinations);
    }
    else
    {
        // old algorithm:
        combinator = new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations);
    }
    const optimize_cfg = {
        osd_weights,
        combinator,
        pg_count: pool_cfg.pg_count,
        pg_size: pool_cfg.pg_size,
        pg_minsize: pool_cfg.pg_minsize,
        ordered: pool_cfg.scheme != 'replicated',
    };
    let optimize_result;
    // Re-shuffle PGs if config/pgs.hash is empty
    if (old_pg_count > 0 && state.config.pgs.hash)
    {
        if (prev_pgs.length != pool_cfg.pg_count)
        {
            // Scale PG count
            // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
            // because last_clean_pgs may still contain the old number of PGs
            scale_pg_count(prev_pgs, pool_cfg.pg_count);
        }
        // Pad shorter sets with 0 up to the current pg_size
        for (const pg of prev_pgs)
        {
            while (pg.length < pool_cfg.pg_size)
            {
                pg.push(0);
            }
        }
        optimize_result = await LPOptimizer.optimize_change({
            prev_pgs,
            ...optimize_cfg,
        });
    }
    else
    {
        optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
    }
    console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
    LPOptimizer.print_change_stats(optimize_result);
    // Effective PG size is the smallest non-zero number of real OSDs in a PG
    let pg_effsize = pool_cfg.pg_size;
    for (const pg of optimize_result.int_pgs)
    {
        const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
        if (this_pg_size && this_pg_size < pg_effsize)
        {
            pg_effsize = this_pg_size;
        }
    }
    const real_size = pg_effsize || pool_cfg.pg_size;
    const data_chunks = pool_cfg.scheme === 'replicated'
        ? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0));
    return {
        pool_id,
        pgs: optimize_result.int_pgs,
        stats: {
            total_raw_tb: optimize_result.space,
            pg_real_size: real_size,
            raw_to_usable: real_size / data_chunks,
            space_efficiency: optimize_result.space/(optimize_result.total_space||1),
        },
    };
}
|
||||
|
||||
// Encode <str> as base64
function b64(str)
{
    const raw = Buffer.from(str);
    return raw.toString('base64');
}
|
||||
|
||||
module.exports = {
|
||||
recheck_primary,
|
||||
save_new_pgs_txn,
|
||||
generate_pool_pgs,
|
||||
};
|
169
mon/pool_config.js
Normal file
169
mon/pool_config.js
Normal file
@@ -0,0 +1,169 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { parse_level_indexes, parse_pg_dsl } = require('./lp_optimizer/dsl_pgs.js');
|
||||
|
||||
// Normalize and validate the configuration of one pool.
//
// Numeric fields of <pool_cfg> (pg_size, pg_minsize, parity_chunks, pg_count,
// max_osd_combinations) are normalized in place: floored, with max_osd_combinations
// defaulting to 10000 and a zero/invalid parity_chunks becoming undefined.
//
// Returns true when the pool is valid, false otherwise. When <warn> is set,
// the reason for rejecting the pool is printed to the console.
function validate_pool_cfg(pool_id, pool_cfg, placement_levels, warn)
{
    // Report a problem (only in warn mode) and reject the pool
    const reject = (message) =>
    {
        if (warn)
            console.log(message);
        return false;
    };
    pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
    pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
    pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
    pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
    pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
    if (!/^[1-9]\d*$/.exec(''+pool_id))
    {
        return reject('Pool ID '+pool_id+' is invalid');
    }
    if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
        pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
    {
        return reject('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
    }
    if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
        pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
    {
        return reject('Pool '+pool_id+' has invalid pg_size');
    }
    if (!pool_cfg.pg_minsize || pool_cfg.pg_minsize < 1 || pool_cfg.pg_minsize > pool_cfg.pg_size ||
        pool_cfg.scheme === 'xor' && pool_cfg.pg_minsize < (pool_cfg.pg_size - 1))
    {
        return reject('Pool '+pool_id+' has invalid pg_minsize');
    }
    if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
    {
        return reject('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
    }
    // parity_chunks is undefined when missing or zero; "undefined < 1" is false,
    // so the lower bound is checked in the negated form to reject that case too
    if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
        (!(pool_cfg.parity_chunks >= 1) || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
    {
        return reject('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
    }
    if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
    {
        return reject('Pool '+pool_id+' has invalid pg_count');
    }
    if (!pool_cfg.name)
    {
        return reject('Pool '+pool_id+' has empty name');
    }
    if (pool_cfg.max_osd_combinations < 100)
    {
        return reject('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
    }
    if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
    {
        return reject('Pool '+pool_id+' has invalid root_node (must be a string)');
    }
    if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
        (!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
    {
        return reject('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
    }
    if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
        (!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
    {
        return reject('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
    }
    // Placement rules print their own diagnostics; honor the caller's warn flag
    if (!get_pg_rules(pool_id, pool_cfg, placement_levels, warn))
    {
        return false;
    }
    return true;
}
|
||||
|
||||
// Build PG placement rules for a pool.
//
// - With pool_cfg.level_placement (an object or a "level=spec level=spec" string):
//   the rules are checked and passed to parse_level_indexes(). A rule for the failure
//   domain level (default 'host') listing pg_size distinct numbers is always set.
// - Else with a string pool_cfg.raw_placement: the result of parse_pg_dsl().
// - Else: default rules requiring every OSD of a PG to differ from all previous ones
//   at the failure domain level.
//
// Returns null when the configuration is invalid; the reason is printed if <warn> is set.
// Side effect: in the level_placement case, default numbers for the 'host' (100) and
// 'osd' (101) levels are written into <placement_levels> when it is passed.
function get_pg_rules(pool_id, pool_cfg, placement_levels, warn)
{
    if (pool_cfg.level_placement)
    {
        const pg_size = (0|pool_cfg.pg_size);
        let rules = pool_cfg.level_placement;
        if (typeof rules === 'string')
        {
            // Empty pieces produced by leading/trailing whitespace are dropped
            rules = rules.split(/\s+/).filter(s => s).map(s => s.split(/=/, 2))
                .reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
        }
        else
        {
            rules = { ...rules };
        }
        // Always add failure_domain to prevent rules from being totally incorrect
        const all_diff = [];
        for (let i = 1; i <= pg_size; i++)
        {
            all_diff.push(i);
        }
        rules[pool_cfg.failure_domain || 'host'] = all_diff;
        placement_levels = placement_levels||{};
        placement_levels.host = placement_levels.host || 100;
        placement_levels.osd = placement_levels.osd || 101;
        for (const k in rules)
        {
            if (!placement_levels[k] || typeof rules[k] !== 'string' &&
                (!(rules[k] instanceof Array) ||
                rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
            {
                if (warn)
                    console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
                return null;
            }
            else if (rules[k].length != pg_size)
            {
                if (warn)
                    console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
                return null;
            }
        }
        return parse_level_indexes(rules);
    }
    else if (typeof pool_cfg.raw_placement === 'string')
    {
        try
        {
            return parse_pg_dsl(pool_cfg.raw_placement);
        }
        catch (e)
        {
            if (warn)
                console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
            // Same "invalid" result as for bad level_placement
            return null;
        }
    }
    else
    {
        const rules = [ [] ];
        let prev = [ 1 ];
        for (let i = 1; i < pool_cfg.pg_size; i++)
        {
            rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
            prev = [ ...prev, i+1 ];
        }
        return rules;
    }
}
|
||||
|
||||
module.exports = {
|
||||
validate_pool_cfg,
|
||||
get_pg_rules,
|
||||
};
|
286
mon/stats.js
Normal file
286
mon/stats.js
Normal file
@@ -0,0 +1,286 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
// Derive per-second rates for one OSD from two snapshots of its cumulative counters.
//
// <st> is the current snapshot, <prev> is the previous one; both carry `time` in
// seconds and the op_stats / subop_stats / recovery_stats / inode_stats counters.
// Returns { op_stats, subop_stats, recovery_stats, inode_stats } with BigInt values:
// bps and iops are deltas per second, lat is the usec delta per operation.
// op/subop/recovery entries are only produced for operations whose count has grown.
// When no positive time interval can be computed, <prev_diff> (or an empty result
// when it is not set) is returned instead.
function derive_osd_stats(st, prev, prev_diff)
{
    const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
    if (!st || !st.time || !prev || !prev.time)
    {
        return prev_diff || diff;
    }
    // Compare timestamps as numbers and round the interval to whole milliseconds:
    // BigInt() throws on a fractional or NaN argument
    const elapsed_ms = Math.round(Number(st.time)*1000 - Number(prev.time)*1000);
    if (!Number.isFinite(elapsed_ms) || elapsed_ms <= 0)
    {
        return prev_diff || diff;
    }
    const timediff = BigInt(elapsed_ms);
    for (const op in st.op_stats||{})
    {
        const pr = prev.op_stats && prev.op_stats[op];
        let c = st.op_stats[op] || {};
        c = { bytes: BigInt(c.bytes||0), usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
        const b = c.bytes - BigInt(pr && pr.bytes||0);
        const us = c.usec - BigInt(pr && pr.usec||0);
        const n = c.count - BigInt(pr && pr.count||0);
        if (n > 0)
            diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
    }
    for (const op in st.subop_stats||{})
    {
        const pr = prev.subop_stats && prev.subop_stats[op];
        let c = st.subop_stats[op] || {};
        c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
        const us = c.usec - BigInt(pr && pr.usec||0);
        const n = c.count - BigInt(pr && pr.count||0);
        if (n > 0)
            diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
    }
    for (const op in st.recovery_stats||{})
    {
        const pr = prev.recovery_stats && prev.recovery_stats[op];
        let c = st.recovery_stats[op] || {};
        c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
        const b = c.bytes - BigInt(pr && pr.bytes||0);
        const n = c.count - BigInt(pr && pr.count||0);
        if (n > 0)
            diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
    }
    for (const pool_id in st.inode_stats||{})
    {
        diff.inode_stats[pool_id] = {};
        for (const inode_num in st.inode_stats[pool_id])
        {
            const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
            for (const op of [ 'read', 'write', 'delete' ])
            {
                // An operation type may be missing from the snapshot
                const c = st.inode_stats[pool_id][inode_num][op] || {};
                const pr = prev.inode_stats && prev.inode_stats[pool_id] &&
                    prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
                const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
                inode_diff[op] = {
                    bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
                    iops: n*1000n/timediff,
                    // Avoid division by zero when there were no operations
                    lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
                };
            }
        }
    }
    return diff;
}
|
||||
|
||||
// Usage: sum_op_stats(this.state.osd, this.prev_stats)
//
// Refresh the derived statistics of every OSD present in all_osd.stats (stored in
// prev_stats.osd_diff, with the raw snapshot remembered in prev_stats.osd_stats),
// then sum the derived op_stats, subop_stats and recovery_stats over the OSDs
// that have a non-empty entry in all_osd.state.
// Returns { op_stats, subop_stats, recovery_stats } keyed by operation name.
function sum_op_stats(all_osd, prev_stats)
{
    for (const osd_num in all_osd.stats)
    {
        const snapshot = { ...all_osd.stats[osd_num], inode_stats: all_osd.inodestats[osd_num]||{} };
        const last_snapshot = prev_stats.osd_stats[osd_num];
        const last_diff = prev_stats.osd_diff[osd_num];
        prev_stats.osd_diff[osd_num] = derive_osd_stats(snapshot, last_snapshot, last_diff);
        prev_stats.osd_stats[osd_num] = snapshot;
    }
    const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
    // Sum derived values instead of deriving summed
    for (const osd_num in all_osd.state)
    {
        const derived = prev_stats.osd_diff[osd_num];
        if (!all_osd.state[osd_num] || !derived)
        {
            continue;
        }
        for (const type in sum_diff)
        {
            const per_op = derived[type]||{};
            for (const op in per_op)
            {
                for (const field in per_op[op])
                {
                    const totals = sum_diff[type][op] = sum_diff[type][op] || {};
                    totals[field] = (totals[field] || 0n) + per_op[op][field];
                }
            }
        }
    }
    return sum_diff;
}
|
||||
|
||||
// Usage: sum_object_counts(this.state, this.config)
//
// Sum the object / clean / misplaced / degraded / incomplete object counters over all
// PG statistics in state.pg.stats and convert them to bytes.
// The object size of a pool is the block size reported by the first OSD found in
// a write_osd_set of one of the pool's PGs; when none reports it, the pool's or the
// global block_size (default 131072) is used. For non-replicated pools it is
// multiplied by the number of data chunks (pg_size - parity_chunks).
// Returns { object_counts, object_bytes } with BigInt values.
function sum_object_counts(state, global_config)
{
    const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
    const object_bytes = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
    for (const pool_id in state.pg.stats)
    {
        const pool_pg_stats = state.pg.stats[pool_id] || {};
        let object_size = 0;
        // write_osd_set is part of the statistics of each PG, not of the pool
        for (const pg_num in pool_pg_stats)
        {
            for (const osd_num of (pool_pg_stats[pg_num]||{}).write_osd_set||[])
            {
                const osd_stat = osd_num && state.osd.stats[osd_num];
                // NOTE(review): bs_block_size is the field filter_osds_by_block_layout() reads;
                // block_size is kept as a fallback because the previous code read it - confirm
                object_size = osd_stat && (osd_stat.bs_block_size || osd_stat.block_size) || 0;
                if (object_size)
                {
                    break;
                }
            }
            if (object_size)
            {
                break;
            }
        }
        const pool_cfg = (state.config.pools[pool_id]||{});
        if (!object_size)
        {
            object_size = pool_cfg.block_size || global_config.block_size || 131072;
        }
        if (pool_cfg.scheme !== 'replicated')
        {
            object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
        }
        object_size = BigInt(object_size);
        for (const pg_num in pool_pg_stats)
        {
            const st = pool_pg_stats[pg_num];
            if (st)
            {
                for (const k in object_counts)
                {
                    if (st[k+'_count'])
                    {
                        object_counts[k] += BigInt(st[k+'_count']);
                        object_bytes[k] += BigInt(st[k+'_count']) * object_size;
                    }
                }
            }
        }
    }
    return { object_counts, object_bytes };
}
|
||||
|
||||
// Usage: sum_inode_stats(this.state, this.prev_stats)
//
// Aggregate per-inode statistics over all OSDs:
// - raw_used: sum of state.osd.space values; the per-pool total is also stored,
//   converted to terabytes, in state.pool.stats[pool_id].used_raw_tb
// - read/write/delete count, usec, bytes: sums of state.osd.inodestats counters
//   of the OSDs that have a non-empty entry in state.osd.state
// - read/write/delete bps, iops: sums of the derived values in prev_stats.osd_diff;
//   lat is the average of the derived latencies over the contributing OSDs
// Inodes without data, without I/O and without configuration are dropped.
// Returns { inode_stats, seen_pools }.
function sum_inode_stats(state, prev_stats)
{
    const op_names = [ 'read', 'write', 'delete' ];
    const inode_stats = {};
    const inode_stub = () => ({
        raw_used: 0n,
        read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
    });
    const seen_pools = {};
    for (const pool_id in state.config.pools)
    {
        seen_pools[pool_id] = true;
        state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
        state.pool.stats[pool_id].used_raw_tb = 0n;
    }
    for (const osd_num in state.osd.space)
    {
        for (const pool_id in state.osd.space[osd_num])
        {
            state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
            if (!seen_pools[pool_id])
            {
                state.pool.stats[pool_id].used_raw_tb = 0n;
                seen_pools[pool_id] = true;
            }
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in state.osd.space[osd_num][pool_id])
            {
                const u = BigInt(state.osd.space[osd_num][pool_id][inode_num]||0);
                if (inode_num)
                {
                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                    inode_stats[pool_id][inode_num].raw_used += u;
                }
                state.pool.stats[pool_id].used_raw_tb += u;
            }
        }
    }
    // The total was accumulated in bytes as a BigInt; convert it to terabytes
    for (const pool_id in seen_pools)
    {
        const used = state.pool.stats[pool_id].used_raw_tb;
        state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
    }
    for (const osd_num in state.osd.state)
    {
        const ist = state.osd.inodestats[osd_num];
        if (!ist || !state.osd.state[osd_num])
        {
            continue;
        }
        for (const pool_id in ist)
        {
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in ist[pool_id])
            {
                inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                for (const op of op_names)
                {
                    // An operation type may be missing from the reported counters
                    const reported = ist[pool_id][inode_num][op] || {};
                    const op_st = inode_stats[pool_id][inode_num][op];
                    op_st.count += BigInt(reported.count||0);
                    op_st.usec += BigInt(reported.usec||0);
                    op_st.bytes += BigInt(reported.bytes||0);
                }
            }
        }
    }
    for (const osd in state.osd.state)
    {
        const osd_diff = prev_stats.osd_diff[osd];
        if (!osd_diff || !state.osd.state[osd])
        {
            continue;
        }
        for (const pool_id in osd_diff.inode_stats)
        {
            // A stored diff may mention a pool that no current report contains
            inode_stats[pool_id] = inode_stats[pool_id] || {};
            for (const inode_num in osd_diff.inode_stats[pool_id])
            {
                inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                for (const op of op_names)
                {
                    const op_diff = osd_diff.inode_stats[pool_id][inode_num][op] || {};
                    const op_st = inode_stats[pool_id][inode_num][op];
                    // BigInt can't be added to undefined, so default missing values to 0n
                    op_st.bps += op_diff.bps || 0n;
                    op_st.iops += op_diff.iops || 0n;
                    op_st.lat += op_diff.lat || 0n;
                    op_st.n_osd = (op_st.n_osd || 0) + 1;
                }
            }
        }
    }
    for (const pool_id in inode_stats)
    {
        for (const inode_num in inode_stats[pool_id])
        {
            let nonzero = inode_stats[pool_id][inode_num].raw_used > 0;
            for (const op of op_names)
            {
                const op_st = inode_stats[pool_id][inode_num][op];
                if (op_st.n_osd)
                {
                    // Turn the sum of per-OSD latencies into their average
                    op_st.lat /= BigInt(op_st.n_osd);
                    delete op_st.n_osd;
                }
                if (op_st.bps > 0 || op_st.iops > 0)
                    nonzero = true;
            }
            if (!nonzero && (!state.config.inode[pool_id] || !state.config.inode[pool_id][inode_num]))
            {
                // Deleted inode (no data, no I/O, no config)
                delete inode_stats[pool_id][inode_num];
            }
        }
    }
    return { inode_stats, seen_pools };
}
|
||||
|
||||
// Return a deep copy of <obj> in which every BigInt value is replaced with its
// decimal string representation. Nested objects and arrays are copied recursively
// (arrays stay arrays), nested null values are kept as null.
// The input is not modified.
function serialize_bigints(obj)
{
    const convert = (value) =>
    {
        if (typeof value == 'bigint')
        {
            return ''+value;
        }
        // typeof null is 'object' too, but null has nothing to copy
        if (value !== null && typeof value == 'object')
        {
            return serialize_bigints(value);
        }
        return value;
    };
    if (obj instanceof Array)
    {
        return obj.map(convert);
    }
    obj = { ...obj };
    for (const k in obj)
    {
        obj[k] = convert(obj[k]);
    }
    return obj;
}
|
||||
|
||||
module.exports = {
|
||||
derive_osd_stats,
|
||||
sum_op_stats,
|
||||
sum_object_counts,
|
||||
sum_inode_stats,
|
||||
serialize_bigints,
|
||||
};
|
@@ -707,10 +707,10 @@ class VitastorDriver(driver.CloneableImageVD,
|
||||
return ({}, True)
|
||||
return ({}, False)
|
||||
|
||||
def copy_image_to_encrypted_volume(self, context, volume, image_service, image_id):
|
||||
self.copy_image_to_volume(context, volume, image_service, image_id, encrypted = True)
|
||||
def copy_image_to_encrypted_volume(self, context, volume, image_service, image_id, disable_sparse=False):
    # Forward the caller's disable_sparse value instead of always passing False
    self.copy_image_to_volume(context, volume, image_service, image_id, encrypted = True, disable_sparse=disable_sparse)
|
||||
|
||||
def copy_image_to_volume(self, context, volume, image_service, image_id, encrypted = False):
|
||||
def copy_image_to_volume(self, context, volume, image_service, image_id, encrypted = False, disable_sparse=False):
|
||||
tmp_dir = volume_utils.image_conversion_dir()
|
||||
with tempfile.NamedTemporaryFile(dir = tmp_dir) as tmp:
|
||||
image_utils.fetch_to_raw(
|
||||
|
670
patches/libvirt-10.0-vitastor.diff
Normal file
670
patches/libvirt-10.0-vitastor.diff
Normal file
@@ -0,0 +1,670 @@
|
||||
From 571bde71268dcca6446454bb1e895e21bcc7b2a0 Mon Sep 17 00:00:00 2001
|
||||
From: ace <ace@0xace.cc>
|
||||
Date: Sat, 18 May 2024 19:45:49 +0300
|
||||
Subject: [PATCH] Add Vitastor support
|
||||
|
||||
---
|
||||
include/libvirt/libvirt-storage.h | 1 +
|
||||
src/conf/domain_conf.c | 4 +-
|
||||
src/conf/domain_validate.c | 10 +-
|
||||
src/conf/schemas/domaincommon.rng | 30 +++++
|
||||
src/conf/storage_conf.c | 20 ++-
|
||||
src/conf/storage_conf.h | 2 +
|
||||
src/conf/storage_source_conf.c | 2 +
|
||||
src/conf/storage_source_conf.h | 1 +
|
||||
src/conf/virstorageobj.c | 3 +
|
||||
src/libvirt-storage.c | 1 +
|
||||
src/libxl/libxl_conf.c | 1 +
|
||||
src/libxl/xen_xl.c | 1 +
|
||||
src/qemu/qemu_block.c | 45 +++++++
|
||||
src/qemu/qemu_domain.c | 4 +-
|
||||
src/qemu/qemu_snapshot.c | 2 +
|
||||
src/storage/storage_driver.c | 1 +
|
||||
.../storage_source_backingstore.c | 123 ++++++++++++++++++
|
||||
src/test/test_driver.c | 1 +
|
||||
.../storagepoolcapsschemadata/poolcaps-fs.xml | 7 +
|
||||
.../poolcaps-full.xml | 7 +
|
||||
tests/storagepoolxml2argvtest.c | 1 +
|
||||
tools/virsh-pool.c | 3 +
|
||||
22 files changed, 265 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3da1..5f5daa8341 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index 52a5796ad2..089697b2a3 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7191,7 +7191,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -30657,6 +30658,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index faa7659f07..01b907d60d 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
_("<snapshot> element is currently supported only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
+ }
|
||||
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
- _("<config> element is currently supported only with 'rbd' disks"));
|
||||
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index df44cd9857..4bb72fc697 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -1997,6 +1997,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2347,6 +2376,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 68842004b7..1d69a788b6 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957cfe..720c07ef74 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index 959ec5ed40..e751dd4d6a 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index 05b4bda16c..b5ed143c39 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -129,6 +129,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 59fa5da372..4739167f5f 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index db7660aac4..561df34709 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 62e1be6672..71a1d42896 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index f175359307..8efcf4c329 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index c9f5cbbf29..dbbac36836 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -2020,6 +2058,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2400,6 +2439,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index 341c543280..61b248fa2c 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -5207,7 +5207,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -10387,6 +10388,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index 0cac0c4146..4955ebd8d4 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index 314fe930e0..fb615a8b4e 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index 80681924ea..8a3ade9ec0 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index ed545848af..dbfdbe8476 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7336,6 +7336,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index e8e40d695e..db55fe5f3a 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index 36f00cf643..5f5bd3464e 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1223,6 +1223,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
643
patches/libvirt-10.4-vitastor.diff
Normal file
643
patches/libvirt-10.4-vitastor.diff
Normal file
@@ -0,0 +1,643 @@
|
||||
commit 1f7e90e36b2afca0312392979b96d31951a8d66b
|
||||
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Date: Thu Jun 27 01:34:54 2024 +0300
|
||||
|
||||
Add Vitastor support
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3da1..5f5daa8341 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index fde594f811..66537db3e3 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7220,7 +7220,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -30734,6 +30735,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index 395e036e8f..8a0190f85b 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
_("<snapshot> element is currently supported only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
+ }
|
||||
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
- _("<config> element is currently supported only with 'rbd' disks"));
|
||||
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index a46a824f88..4c5b720643 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -1997,6 +1997,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2347,6 +2376,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 68842004b7..1d69a788b6 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957cfe..720c07ef74 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index 959ec5ed40..e751dd4d6a 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index 05b4bda16c..b5ed143c39 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -129,6 +129,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 59fa5da372..4739167f5f 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index db7660aac4..561df34709 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 62e1be6672..71a1d42896 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index 53f6871efc..c34b8cee1a 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index 738b72d7ea..5dd082fc89 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -2020,6 +2058,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2400,6 +2439,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index bda62f2e5c..84b4e5f2b8 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -5260,7 +5260,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -10514,6 +10515,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index f5260c4a22..2f9d8406fe 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index 86c03762d2..630c6eff1a 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index 80681924ea..8a3ade9ec0 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index d2d1bc43e3..31a92e4a01 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7339,6 +7339,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index e8e40d695e..db55fe5f3a 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index f9aad8ded0..64704b4288 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
288
patches/nova-28.diff
Normal file
288
patches/nova-28.diff
Normal file
@@ -0,0 +1,288 @@
|
||||
diff --git a/nova/virt/image/model.py b/nova/virt/image/model.py
|
||||
index 971f7e9c07..ec3fca72cb 100644
|
||||
--- a/nova/virt/image/model.py
|
||||
+++ b/nova/virt/image/model.py
|
||||
@@ -129,3 +129,22 @@ class RBDImage(Image):
|
||||
self.user = user
|
||||
self.password = password
|
||||
self.servers = servers
|
||||
+
|
||||
+
|
||||
+class VitastorImage(Image):
|
||||
+ """Class for images in a remote Vitastor cluster"""
|
||||
+
|
||||
+ def __init__(self, name, etcd_address = None, etcd_prefix = None, config_path = None):
|
||||
+ """Create a new Vitastor image object
|
||||
+
|
||||
+ :param name: name of the image
|
||||
+ :param etcd_address: etcd URL(s) (optional)
|
||||
+ :param etcd_prefix: etcd prefix (optional)
|
||||
+ :param config_path: path to the configuration (optional)
|
||||
+ """
|
||||
+ super(VitastorImage, self).__init__(FORMAT_RAW)
|
||||
+
|
||||
+ self.name = name
|
||||
+ self.etcd_address = etcd_address
|
||||
+ self.etcd_prefix = etcd_prefix
|
||||
+ self.config_path = config_path
|
||||
diff --git a/nova/virt/images.py b/nova/virt/images.py
|
||||
index 5358f3766a..ebe3d6effb 100644
|
||||
--- a/nova/virt/images.py
|
||||
+++ b/nova/virt/images.py
|
||||
@@ -41,7 +41,7 @@ IMAGE_API = glance.API()
|
||||
|
||||
def qemu_img_info(path, format=None):
|
||||
"""Return an object containing the parsed output from qemu-img info."""
|
||||
- if not os.path.exists(path) and not path.startswith('rbd:'):
|
||||
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
|
||||
raise exception.DiskNotFound(location=path)
|
||||
|
||||
info = nova.privsep.qemu.unprivileged_qemu_img_info(path, format=format)
|
||||
@@ -50,7 +50,7 @@ def qemu_img_info(path, format=None):
|
||||
|
||||
def privileged_qemu_img_info(path, format=None, output_format='json'):
|
||||
"""Return an object containing the parsed output from qemu-img info."""
|
||||
- if not os.path.exists(path) and not path.startswith('rbd:'):
|
||||
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
|
||||
raise exception.DiskNotFound(location=path)
|
||||
|
||||
info = nova.privsep.qemu.privileged_qemu_img_info(path, format=format)
|
||||
diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py
|
||||
index f9475776b3..a2e18aab67 100644
|
||||
--- a/nova/virt/libvirt/config.py
|
||||
+++ b/nova/virt/libvirt/config.py
|
||||
@@ -1060,6 +1060,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
|
||||
self.driver_iommu = False
|
||||
self.source_path = None
|
||||
self.source_protocol = None
|
||||
+ self.source_query = None
|
||||
+ self.source_config = None
|
||||
self.source_name = None
|
||||
self.source_hosts = []
|
||||
self.source_ports = []
|
||||
@@ -1189,6 +1191,10 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
|
||||
source = etree.Element("source", protocol=self.source_protocol)
|
||||
if self.source_name is not None:
|
||||
source.set('name', self.source_name)
|
||||
+ if self.source_query is not None:
|
||||
+ source.set('query', self.source_query)
|
||||
+ if self.source_config is not None:
|
||||
+ source.append(etree.Element('config', file=self.source_config))
|
||||
hosts_info = zip(self.source_hosts, self.source_ports)
|
||||
for name, port in hosts_info:
|
||||
host = etree.Element('host', name=name)
|
||||
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
|
||||
index 391231c527..f38faa1608 100644
|
||||
--- a/nova/virt/libvirt/driver.py
|
||||
+++ b/nova/virt/libvirt/driver.py
|
||||
@@ -179,6 +179,7 @@ VOLUME_DRIVERS = {
|
||||
'local': 'nova.virt.libvirt.volume.volume.LibvirtVolumeDriver',
|
||||
'fake': 'nova.virt.libvirt.volume.volume.LibvirtFakeVolumeDriver',
|
||||
'rbd': 'nova.virt.libvirt.volume.net.LibvirtNetVolumeDriver',
|
||||
+ 'vitastor': 'nova.virt.libvirt.volume.vitastor.LibvirtVitastorVolumeDriver',
|
||||
'nfs': 'nova.virt.libvirt.volume.nfs.LibvirtNFSVolumeDriver',
|
||||
'smbfs': 'nova.virt.libvirt.volume.smbfs.LibvirtSMBFSVolumeDriver',
|
||||
'fibre_channel': 'nova.virt.libvirt.volume.fibrechannel.LibvirtFibreChannelVolumeDriver', # noqa:E501
|
||||
@@ -385,10 +386,10 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# This prevents the risk of one test setting a capability
|
||||
# which bleeds over into other tests.
|
||||
|
||||
- # LVM and RBD require raw images. If we are not configured to
|
||||
+ # LVM, RBD, Vitastor require raw images. If we are not configured to
|
||||
# force convert images into raw format, then we _require_ raw
|
||||
# images only.
|
||||
- raw_only = ('rbd', 'lvm')
|
||||
+ raw_only = ('rbd', 'lvm', 'vitastor')
|
||||
requires_raw_image = (CONF.libvirt.images_type in raw_only and
|
||||
not CONF.force_raw_images)
|
||||
requires_ploop_image = CONF.libvirt.virt_type == 'parallels'
|
||||
@@ -775,12 +776,12 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# Some imagebackends are only able to import raw disk images,
|
||||
# and will fail if given any other format. See the bug
|
||||
# https://bugs.launchpad.net/nova/+bug/1816686 for more details.
|
||||
- if CONF.libvirt.images_type in ('rbd',):
|
||||
+ if CONF.libvirt.images_type in ('rbd', 'vitastor'):
|
||||
if not CONF.force_raw_images:
|
||||
msg = _("'[DEFAULT]/force_raw_images = False' is not "
|
||||
- "allowed with '[libvirt]/images_type = rbd'. "
|
||||
+ "allowed with '[libvirt]/images_type = rbd' or 'vitastor'. "
|
||||
"Please check the two configs and if you really "
|
||||
- "do want to use rbd as images_type, set "
|
||||
+ "do want to use rbd or vitastor as images_type, set "
|
||||
"force_raw_images to True.")
|
||||
raise exception.InvalidConfiguration(msg)
|
||||
|
||||
@@ -2603,6 +2604,16 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
if connection_info['data'].get('auth_enabled'):
|
||||
username = connection_info['data']['auth_username']
|
||||
path = f"rbd:{volume_name}:id={username}"
|
||||
+ elif connection_info['driver_volume_type'] == 'vitastor':
|
||||
+ volume_name = connection_info['data']['name']
|
||||
+ path = 'vitastor:image='+volume_name.replace(':', '\\:')
|
||||
+ for k in [ 'config_path', 'etcd_address', 'etcd_prefix' ]:
|
||||
+ if k in connection_info['data']:
|
||||
+ kk = k
|
||||
+ if kk == 'etcd_address':
|
||||
+ # FIXME use etcd_address in qemu driver
|
||||
+ kk = 'etcd_host'
|
||||
+ path += ":"+kk.replace('_', '-')+"="+connection_info['data'][k].replace(':', '\\:')
|
||||
else:
|
||||
path = 'unknown'
|
||||
raise exception.DiskNotFound(location='unknown')
|
||||
@@ -2827,8 +2838,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
|
||||
image_format = CONF.libvirt.snapshot_image_format or source_type
|
||||
|
||||
- # NOTE(bfilippov): save lvm and rbd as raw
|
||||
- if image_format == 'lvm' or image_format == 'rbd':
|
||||
+ # NOTE(bfilippov): save lvm and rbd and vitastor as raw
|
||||
+ if image_format == 'lvm' or image_format == 'rbd' or image_format == 'vitastor':
|
||||
image_format = 'raw'
|
||||
|
||||
metadata = self._create_snapshot_metadata(instance.image_meta,
|
||||
@@ -2899,7 +2910,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
expected_state=task_states.IMAGE_UPLOADING)
|
||||
|
||||
# TODO(nic): possibly abstract this out to the root_disk
|
||||
- if source_type == 'rbd' and live_snapshot:
|
||||
+ if (source_type == 'rbd' or source_type == 'vitastor') and live_snapshot:
|
||||
# Standard snapshot uses qemu-img convert from RBD which is
|
||||
# not safe to run with live_snapshot.
|
||||
live_snapshot = False
|
||||
@@ -4099,7 +4110,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# cleanup rescue volume
|
||||
lvm.remove_volumes([lvmdisk for lvmdisk in self._lvm_disks(instance)
|
||||
if lvmdisk.endswith('.rescue')])
|
||||
- if CONF.libvirt.images_type == 'rbd':
|
||||
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
|
||||
filter_fn = lambda disk: (disk.startswith(instance.uuid) and
|
||||
disk.endswith('.rescue'))
|
||||
rbd_utils.RBDDriver().cleanup_volumes(filter_fn)
|
||||
@@ -4356,6 +4367,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# TODO(mikal): there is a bug here if images_type has
|
||||
# changed since creation of the instance, but I am pretty
|
||||
# sure that this bug already exists.
|
||||
+ if CONF.libvirt.images_type == 'vitastor':
|
||||
+ return 'vitastor'
|
||||
return 'rbd' if CONF.libvirt.images_type == 'rbd' else 'raw'
|
||||
|
||||
@staticmethod
|
||||
@@ -4764,10 +4777,10 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
finally:
|
||||
# NOTE(mikal): if the config drive was imported into RBD,
|
||||
# then we no longer need the local copy
|
||||
- if CONF.libvirt.images_type == 'rbd':
|
||||
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
|
||||
LOG.info('Deleting local config drive %(path)s '
|
||||
- 'because it was imported into RBD.',
|
||||
- {'path': config_disk_local_path},
|
||||
+ 'because it was imported into %(type)s.',
|
||||
+ {'path': config_disk_local_path, 'type': CONF.libvirt.images_type},
|
||||
instance=instance)
|
||||
os.unlink(config_disk_local_path)
|
||||
|
||||
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
|
||||
index da2a6e8b8a..52c02e72f1 100644
|
||||
--- a/nova/virt/libvirt/utils.py
|
||||
+++ b/nova/virt/libvirt/utils.py
|
||||
@@ -340,6 +340,10 @@ def find_disk(guest: libvirt_guest.Guest) -> ty.Tuple[str, ty.Optional[str]]:
|
||||
disk_path = disk.source_name
|
||||
if disk_path:
|
||||
disk_path = 'rbd:' + disk_path
|
||||
+ elif not disk_path and disk.source_protocol == 'vitastor':
|
||||
+ disk_path = disk.source_name
|
||||
+ if disk_path:
|
||||
+ disk_path = 'vitastor:' + disk_path
|
||||
|
||||
if not disk_path:
|
||||
raise RuntimeError(_("Can't retrieve root device path "
|
||||
@@ -354,6 +358,8 @@ def get_disk_type_from_path(path: str) -> ty.Optional[str]:
|
||||
return 'lvm'
|
||||
elif path.startswith('rbd:'):
|
||||
return 'rbd'
|
||||
+ elif path.startswith('vitastor:'):
|
||||
+ return 'vitastor'
|
||||
elif (os.path.isdir(path) and
|
||||
os.path.exists(os.path.join(path, "DiskDescriptor.xml"))):
|
||||
return 'ploop'
|
||||
diff --git a/nova/virt/libvirt/volume/vitastor.py b/nova/virt/libvirt/volume/vitastor.py
|
||||
new file mode 100644
|
||||
index 0000000000..0256df62c1
|
||||
--- /dev/null
|
||||
+++ b/nova/virt/libvirt/volume/vitastor.py
|
||||
@@ -0,0 +1,75 @@
|
||||
+# Copyright (c) 2021+, Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
+#
|
||||
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
+# not use this file except in compliance with the License. You may obtain
|
||||
+# a copy of the License at
|
||||
+#
|
||||
+# http://www.apache.org/licenses/LICENSE-2.0
|
||||
+#
|
||||
+# Unless required by applicable law or agreed to in writing, software
|
||||
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
+# License for the specific language governing permissions and limitations
|
||||
+# under the License.
|
||||
+
|
||||
+from os_brick import exception as os_brick_exception
|
||||
+from os_brick import initiator
|
||||
+from os_brick.initiator import connector
|
||||
+from oslo_log import log as logging
|
||||
+
|
||||
+import nova.conf
|
||||
+from nova import utils
|
||||
+from nova.virt.libvirt.volume import volume as libvirt_volume
|
||||
+
|
||||
+
|
||||
+CONF = nova.conf.CONF
|
||||
+LOG = logging.getLogger(__name__)
|
||||
+
|
||||
+
|
||||
+class LibvirtVitastorVolumeDriver(libvirt_volume.LibvirtBaseVolumeDriver):
|
||||
+ """Driver to attach Vitastor volumes to libvirt."""
|
||||
+ def __init__(self, host):
|
||||
+ super(LibvirtVitastorVolumeDriver, self).__init__(host, is_block_dev=False)
|
||||
+
|
||||
+ def connect_volume(self, connection_info, instance):
|
||||
+ pass
|
||||
+
|
||||
+ def disconnect_volume(self, connection_info, instance, force=False):
|
||||
+ pass
|
||||
+
|
||||
+ def get_config(self, connection_info, disk_info):
|
||||
+ """Returns xml for libvirt."""
|
||||
+ conf = super(LibvirtVitastorVolumeDriver, self).get_config(connection_info, disk_info)
|
||||
+ conf.source_type = 'network'
|
||||
+ conf.source_protocol = 'vitastor'
|
||||
+ conf.source_name = connection_info['data'].get('name')
|
||||
+ conf.source_query = connection_info['data'].get('etcd_prefix') or None
|
||||
+ conf.source_config = connection_info['data'].get('config_path') or None
|
||||
+ conf.source_hosts = []
|
||||
+ conf.source_ports = []
|
||||
+ addresses = connection_info['data'].get('etcd_address', '')
|
||||
+ if addresses:
|
||||
+ if not isinstance(addresses, list):
|
||||
+ addresses = addresses.split(',')
|
||||
+ for addr in addresses:
|
||||
+ if addr.startswith('https://'):
|
||||
+ raise NotImplementedError('Vitastor block driver does not support SSL for etcd communication yet')
|
||||
+ if addr.startswith('http://'):
|
||||
+ addr = addr[7:]
|
||||
+ addr = addr.rstrip('/')
|
||||
+ if addr.endswith('/v3'):
|
||||
+ addr = addr[0:-3]
|
||||
+ p = addr.find('/')
|
||||
+ if p > 0:
|
||||
+ raise NotImplementedError('libvirt does not support custom URL paths for Vitastor etcd yet. Use /etc/vitastor/vitastor.conf')
|
||||
+ p = addr.find(':')
|
||||
+ port = '2379'
|
||||
+ if p > 0:
|
||||
+ port = addr[p+1:]
|
||||
+ addr = addr[0:p]
|
||||
+ conf.source_hosts.append(addr)
|
||||
+ conf.source_ports.append(port)
|
||||
+ return conf
|
||||
+
|
||||
+ def extend_volume(self, connection_info, instance, requested_size):
|
||||
+ return requested_size
|
190
patches/qemu-8.2-vitastor.patch
Normal file
190
patches/qemu-8.2-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 59ff6d380c..abde3715c2 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -109,6 +109,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 6c77d9687d..390683ee71 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1295,6 +1295,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'))
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -2157,6 +2177,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||
@@ -4356,6 +4377,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
|
||||
summary_info += {'libcap-ng support': libcap_ng}
|
||||
summary_info += {'bpf support': libbpf}
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index c9baeda639..85e1df5a56 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index ca390c5700..8f11ae9fa5 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3201,7 +3201,7 @@
|
||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4255,6 +4255,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4713,6 +4735,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5148,6 +5171,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5370,6 +5404,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index 76781f17f4..ac5fe3aa08 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -30,7 +30,7 @@
|
||||
--with-suffix="qemu-kvm" \
|
||||
--firmwarepath=/usr/share/qemu-firmware \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -176,6 +176,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 680fa3f581..dab422bf04 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -168,6 +168,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
|
||||
@@ -445,6 +446,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
190
patches/qemu-9.0-vitastor.patch
Normal file
190
patches/qemu-9.0-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index e1f03fd773..db0cfb2321 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -114,6 +114,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 91a0aa64c6..e8bc710578 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'))
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -2250,6 +2270,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||
@@ -4443,6 +4464,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
|
||||
summary_info += {'libcap-ng support': libcap_ng}
|
||||
summary_info += {'bpf support': libbpf}
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index 0a99a059ec..16dc440118 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 746d1694c2..fb7aa4423b 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3203,7 +3203,7 @@
|
||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4285,6 +4285,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4741,6 +4763,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5180,6 +5203,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5402,6 +5436,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index 76781f17f4..ac5fe3aa08 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -30,7 +30,7 @@
|
||||
--with-suffix="qemu-kvm" \
|
||||
--firmwarepath=/usr/share/qemu-firmware \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -176,6 +176,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 680fa3f581..dab422bf04 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -168,6 +168,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
|
||||
@@ -445,6 +446,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
@@ -108,10 +108,11 @@ npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
||||
mkdir -p %buildroot/lib/systemd/system
|
||||
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
mkdir -p %buildroot/lib/udev/rules.d
|
||||
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
|
||||
|
||||
%files
|
||||
|
@@ -105,10 +105,11 @@ npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
||||
mkdir -p %buildroot/lib/systemd/system
|
||||
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
mkdir -p %buildroot/lib/udev/rules.d
|
||||
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
|
||||
|
||||
%files
|
||||
|
@@ -98,10 +98,11 @@ npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
|
||||
mkdir -p %buildroot/lib/systemd/system
|
||||
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
mkdir -p %buildroot/lib/udev/rules.d
|
||||
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
|
||||
|
||||
%files
|
||||
|
@@ -366,6 +366,7 @@ resume_0:
|
||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
flusher->dequeuing = false;
|
||||
if (flusher->trim_wanted > 0 && try_trim)
|
||||
{
|
||||
// Attempt forced trim
|
||||
@@ -373,7 +374,6 @@ stop_flusher:
|
||||
flusher->active_flushers++;
|
||||
goto trim_journal;
|
||||
}
|
||||
flusher->dequeuing = false;
|
||||
wait_state = 0;
|
||||
return true;
|
||||
}
|
||||
|
@@ -34,7 +34,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
{
|
||||
// peer_osd just dropped connection
|
||||
// determine WHICH dirty_buffers are now obsolete and repeat them
|
||||
if (wb->repeat_ops_for(this, peer_osd) > 0)
|
||||
if (wb->repeat_ops_for(this, peer_osd, 0, 0) > 0)
|
||||
{
|
||||
continue_ops();
|
||||
}
|
||||
@@ -52,7 +52,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
st_cli.tfd = tfd;
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
||||
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_hook(changes); };
|
||||
st_cli.on_change_pool_config_hook = [this]() { on_change_pool_config_hook(); };
|
||||
st_cli.on_change_pg_state_hook = [this](pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary) { on_change_pg_state_hook(pool_id, pg_num, prev_primary); };
|
||||
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
||||
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
||||
|
||||
@@ -77,11 +78,6 @@ cluster_client_t::~cluster_client_t()
|
||||
|
||||
cluster_op_t::~cluster_op_t()
|
||||
{
|
||||
if (buf)
|
||||
{
|
||||
free(buf);
|
||||
buf = NULL;
|
||||
}
|
||||
if (bitmap_buf)
|
||||
{
|
||||
free(bitmap_buf);
|
||||
@@ -427,7 +423,7 @@ void cluster_client_t::on_load_pgs_hook(bool success)
|
||||
continue_ops();
|
||||
}
|
||||
|
||||
void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes)
|
||||
void cluster_client_t::on_change_pool_config_hook()
|
||||
{
|
||||
for (auto pool_item: st_cli.pool_config)
|
||||
{
|
||||
@@ -450,6 +446,19 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
continue_ops();
|
||||
}
|
||||
|
||||
void cluster_client_t::on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary)
|
||||
{
|
||||
auto & pg_cfg = st_cli.pool_config[pool_id].pg_config[pg_num];
|
||||
if (pg_cfg.cur_primary != prev_primary)
|
||||
{
|
||||
// Repeat this PG operations because an OSD which stopped being primary may not fsync operations
|
||||
if (wb->repeat_ops_for(this, 0, pool_id, pg_num) > 0)
|
||||
{
|
||||
continue_ops();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool cluster_client_t::get_immediate_commit(uint64_t inode)
|
||||
{
|
||||
if (enable_writeback)
|
||||
@@ -570,6 +579,14 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
|
||||
{
|
||||
op->cur_inode = op->inode;
|
||||
op->retval = 0;
|
||||
op->state = 0;
|
||||
op->retry_after = 0;
|
||||
op->inflight_count = 0;
|
||||
op->done_count = 0;
|
||||
op->part_bitmaps = NULL;
|
||||
op->bitmap_buf_size = 0;
|
||||
op->prev_wait = 0;
|
||||
assert(!op->prev && !op->next);
|
||||
// check alignment, readonly flag and so on
|
||||
if (!check_rw(op))
|
||||
{
|
||||
@@ -600,7 +617,9 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
|
||||
{
|
||||
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
|
||||
{
|
||||
wb->copy_write(op, CACHE_WRITTEN);
|
||||
uint64_t flush_id = ++wb->last_flush_id;
|
||||
wb->copy_write(op, CACHE_REPEATING, flush_id);
|
||||
op->flush_id = flush_id;
|
||||
}
|
||||
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
|
||||
{
|
||||
@@ -816,6 +835,10 @@ resume_2:
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||
op->retval = op->len / pool_cfg.bitmap_granularity;
|
||||
}
|
||||
if (op->flush_id)
|
||||
{
|
||||
wb->mark_flush_written(op->inode, op->offset, op->len, op->flush_id);
|
||||
}
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
@@ -988,6 +1011,29 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
bool cluster_client_t::affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num)
|
||||
{
|
||||
if (INODE_POOL(inode) != pool_id)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
uint64_t pg_block_size = pool_cfg.data_block_size * pg_data_size;
|
||||
uint64_t first_stripe = (offset / pg_block_size) * pg_block_size;
|
||||
uint64_t last_stripe = len > 0 ? ((offset + len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
if ((last_stripe/pool_cfg.pg_stripe_size) - (first_stripe/pool_cfg.pg_stripe_size) + 1 >= pool_cfg.real_pg_count)
|
||||
{
|
||||
// All PGs are affected
|
||||
return true;
|
||||
}
|
||||
pg_num_t first_pg_num = (first_stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||
pg_num_t last_pg_num = (last_stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||
return (first_pg_num <= last_pg_num
|
||||
? (pg_num >= first_pg_num && pg_num <= last_pg_num)
|
||||
: (pg_num >= first_pg_num || pg_num <= last_pg_num));
|
||||
}
|
||||
|
||||
bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd)
|
||||
{
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
|
||||
@@ -1210,7 +1256,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
|
||||
// FIXME postpone such things to set_immediate here to avoid bugs
|
||||
// Set op->retry_after to retry operation after a short pause (not immediately)
|
||||
if (!op->retry_after)
|
||||
if (!op->retry_after && (op->retval == -EPIPE ||
|
||||
op->retval == -EIO && client_eio_retry_interval ||
|
||||
op->retval == -ENOSPC && client_retry_enospc))
|
||||
{
|
||||
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
|
||||
}
|
||||
|
@@ -56,8 +56,6 @@ struct cluster_op_t
|
||||
protected:
|
||||
int state = 0;
|
||||
uint64_t cur_inode; // for snapshot reads
|
||||
void *buf = NULL;
|
||||
cluster_op_t *orig_op = NULL;
|
||||
bool needs_reslice = false;
|
||||
int retry_after = 0;
|
||||
int inflight_count = 0, done_count = 0;
|
||||
@@ -66,6 +64,7 @@ protected:
|
||||
unsigned bitmap_buf_size = 0;
|
||||
cluster_op_t *prev = NULL, *next = NULL;
|
||||
int prev_wait = 0;
|
||||
uint64_t flush_id = 0;
|
||||
friend class cluster_client_t;
|
||||
friend class writeback_cache_t;
|
||||
};
|
||||
@@ -81,6 +80,7 @@ class cluster_client_t
|
||||
ring_loop_t *ringloop;
|
||||
|
||||
std::map<pool_id_t, uint64_t> pg_counts;
|
||||
std::map<pool_pg_num_t, osd_num_t> pg_primary;
|
||||
// client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
|
||||
uint64_t client_max_dirty_bytes = 0;
|
||||
uint64_t client_max_dirty_ops = 0;
|
||||
@@ -146,9 +146,11 @@ public:
|
||||
|
||||
protected:
|
||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
|
||||
void on_load_config_hook(json11::Json::object & config);
|
||||
void on_load_pgs_hook(bool success);
|
||||
void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||
void on_change_pool_config_hook();
|
||||
void on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary);
|
||||
void on_change_osd_state_hook(uint64_t peer_osd);
|
||||
void execute_internal(cluster_op_t *op);
|
||||
void unshift_op(cluster_op_t *op);
|
||||
|
@@ -46,11 +46,12 @@ public:
|
||||
bool is_left_merged(dirty_buf_it_t dirty_it);
|
||||
bool is_right_merged(dirty_buf_it_t dirty_it);
|
||||
bool is_merged(const dirty_buf_it_t & dirty_it);
|
||||
void copy_write(cluster_op_t *op, int state);
|
||||
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd);
|
||||
void copy_write(cluster_op_t *op, int state, uint64_t new_flush_id = 0);
|
||||
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num);
|
||||
void start_writebacks(cluster_client_t *cli, int count);
|
||||
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
|
||||
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
|
||||
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
|
||||
void fsync_start();
|
||||
void fsync_error();
|
||||
void fsync_ok();
|
||||
|
@@ -71,7 +71,7 @@ bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
|
||||
return is_left_merged(dirty_it) || is_right_merged(dirty_it);
|
||||
}
|
||||
|
||||
void writeback_cache_t::copy_write(cluster_op_t *op, int state)
|
||||
void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flush_id)
|
||||
{
|
||||
// Save operation for replay when one of PGs goes out of sync
|
||||
// (primary OSD drops our connection in this case)
|
||||
@@ -180,6 +180,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state)
|
||||
.buf = buf,
|
||||
.len = op->len,
|
||||
.state = state,
|
||||
.flush_id = new_flush_id,
|
||||
.refcnt = refcnt,
|
||||
});
|
||||
if (state == CACHE_DIRTY)
|
||||
@@ -208,7 +209,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state)
|
||||
}
|
||||
}
|
||||
|
||||
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
|
||||
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num)
|
||||
{
|
||||
int repeated = 0;
|
||||
if (dirty_buffers.size())
|
||||
@@ -218,8 +219,11 @@ int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
|
||||
for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
|
||||
{
|
||||
bool end = wr_it == dirty_buffers.end();
|
||||
bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
|
||||
cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
|
||||
bool flush_this = !end && wr_it->second.state != CACHE_REPEATING;
|
||||
if (peer_osd)
|
||||
flush_this = flush_this && cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
|
||||
if (pool_id && pg_num)
|
||||
flush_this = flush_this && cli->affects_pg(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, pool_id, pg_num);
|
||||
if (flush_it != wr_it && (end || !flush_this ||
|
||||
wr_it->first.inode != flush_it->first.inode ||
|
||||
wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
|
||||
@@ -265,7 +269,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
|
||||
writebacks_active++;
|
||||
op->callback = [this, flush_id](cluster_op_t* op)
|
||||
{
|
||||
// Buffer flushes should be always retried, regardless of the error,
|
||||
// Buffer flushes are always retried, regardless of the error,
|
||||
// so they should never result in an error here
|
||||
assert(op->retval == op->len);
|
||||
for (auto fl_it = flushed_buffers.find(flush_id);
|
||||
@@ -277,16 +281,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
|
||||
}
|
||||
flushed_buffers.erase(fl_it++);
|
||||
}
|
||||
for (auto dirty_it = find_dirty(op->inode, op->offset);
|
||||
dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
|
||||
dirty_it->first.stripe < op->offset+op->len; dirty_it++)
|
||||
{
|
||||
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
|
||||
{
|
||||
dirty_it->second.flush_id = 0;
|
||||
dirty_it->second.state = CACHE_WRITTEN;
|
||||
}
|
||||
}
|
||||
mark_flush_written(op->inode, op->offset, op->len, flush_id);
|
||||
delete op;
|
||||
writebacks_active--;
|
||||
// We can't call execute_internal because it affects an invalid copy of the list here
|
||||
@@ -304,6 +299,20 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
|
||||
}
|
||||
}
|
||||
|
||||
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
|
||||
{
|
||||
for (auto dirty_it = find_dirty(inode, offset);
|
||||
dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode &&
|
||||
dirty_it->first.stripe < offset+len; dirty_it++)
|
||||
{
|
||||
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
|
||||
{
|
||||
dirty_it->second.flush_id = 0;
|
||||
dirty_it->second.state = CACHE_WRITTEN;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
|
||||
{
|
||||
if (!writeback_queue.size())
|
||||
|
@@ -253,7 +253,7 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
||||
if (this->etcd_ws_keepalive_interval <= 0)
|
||||
{
|
||||
this->etcd_ws_keepalive_interval = 30;
|
||||
this->etcd_ws_keepalive_interval = 5;
|
||||
}
|
||||
this->max_etcd_attempts = config["max_etcd_attempts"].uint64_value();
|
||||
if (this->max_etcd_attempts <= 0)
|
||||
@@ -890,6 +890,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
}
|
||||
}
|
||||
if (on_change_pool_config_hook)
|
||||
{
|
||||
on_change_pool_config_hook();
|
||||
}
|
||||
}
|
||||
else if (key == etcd_prefix+"/config/pgs")
|
||||
{
|
||||
@@ -1028,13 +1032,19 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
else if (value.is_null())
|
||||
{
|
||||
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
||||
auto prev_primary = pg_cfg.cur_primary;
|
||||
pg_cfg.state_exists = false;
|
||||
pg_cfg.cur_primary = 0;
|
||||
pg_cfg.cur_state = 0;
|
||||
if (on_change_pg_state_hook)
|
||||
{
|
||||
on_change_pg_state_hook(pool_id, pg_num, prev_primary);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
||||
auto prev_primary = pg_cfg.cur_primary;
|
||||
pg_cfg.state_exists = true;
|
||||
osd_num_t cur_primary = value["primary"].uint64_value();
|
||||
int state = 0;
|
||||
@@ -1065,6 +1075,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
pg_cfg.cur_primary = cur_primary;
|
||||
pg_cfg.cur_state = state;
|
||||
if (on_change_pg_state_hook)
|
||||
{
|
||||
on_change_pg_state_hook(pool_id, pg_num, prev_primary);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
|
||||
|
@@ -103,7 +103,7 @@ protected:
|
||||
void pick_next_etcd();
|
||||
public:
|
||||
int etcd_keepalive_timeout = 30;
|
||||
int etcd_ws_keepalive_interval = 30;
|
||||
int etcd_ws_keepalive_interval = 5;
|
||||
int max_etcd_attempts = 5;
|
||||
int etcd_quick_timeout = 1000;
|
||||
int etcd_slow_timeout = 5000;
|
||||
@@ -127,6 +127,8 @@ public:
|
||||
std::function<void(json11::Json::object &)> on_load_config_hook;
|
||||
std::function<json11::Json()> load_pgs_checks_hook;
|
||||
std::function<void(bool)> on_load_pgs_hook;
|
||||
std::function<void()> on_change_pool_config_hook;
|
||||
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
|
||||
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
||||
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
||||
std::function<void()> on_reload_hook;
|
||||
|
@@ -271,7 +271,7 @@ void http_co_t::close_connection()
|
||||
}
|
||||
if (peer_fd >= 0)
|
||||
{
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
close(peer_fd);
|
||||
peer_fd = -1;
|
||||
}
|
||||
@@ -314,7 +314,7 @@ void http_co_t::start_connection()
|
||||
stackout();
|
||||
return;
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
@@ -372,7 +372,7 @@ void http_co_t::handle_connect_result()
|
||||
}
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
|
@@ -15,6 +15,207 @@
|
||||
#include "msgr_rdma.h"
|
||||
#endif
|
||||
|
||||
#include <sys/poll.h>
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
static uint64_t one = 1;
|
||||
|
||||
// Set up a messenger I/O thread: a private ring loop, an epoll manager bound
// to it, an eventfd used to wake the thread up, and the worker thread itself.
// Throws std::runtime_error if the eventfd cannot be created.
msgr_iothread_t::msgr_iothread_t()
{
    ring = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    epmgr = new epoll_manager_t(ring);
    submit_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
    if (submit_eventfd < 0)
    {
        // Compose the message first: the cleanup below may overwrite errno
        std::string err = std::string("failed to create eventfd: ")+strerror(errno);
        // The destructor is not run for a partially constructed object,
        // so release what was already allocated before throwing
        delete epmgr;
        epmgr = NULL;
        delete ring;
        ring = NULL;
        throw std::runtime_error(err);
    }
    epmgr->tfd->set_fd_handler(submit_eventfd, EPOLLIN, [this](int fd, int epoll_events)
    {
        // Reset eventfd counter
        uint64_t ctr = 0;
        int r = read(submit_eventfd, &ctr, sizeof(ctr));
        if (r < 0 && errno != EAGAIN && errno != EINTR)
        {
            fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
        }
        ring->wakeup();
    });
    // Every ring loop iteration: queue receives, queue sends, then submit them
    consumer.loop = [this]()
    {
        read_requests();
        send_replies();
        ring->submit();
    };
    ring->register_consumer(&consumer);
    // Start the worker last, when everything it uses is already initialized
    thread = new std::thread(&msgr_iothread_t::run, this);
}
|
||||
|
||||
// Stop the worker thread (no-op if stop() was already called) and release
// the thread object, the epoll manager and the ring loop.
msgr_iothread_t::~msgr_iothread_t()
{
    // stop() joins the worker, so nothing below races with it
    stop();
    delete thread;
    // The epoll manager was constructed on top of the ring, so it goes first
    delete epmgr;
    delete ring;
}
|
||||
|
||||
void msgr_iothread_t::stop()
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
stopped = true;
|
||||
write(submit_eventfd, &one, sizeof(one));
|
||||
mu.unlock();
|
||||
thread->join();
|
||||
ring->unregister_consumer(&consumer);
|
||||
close(submit_eventfd);
|
||||
}
|
||||
|
||||
// Hand a client socket over to this I/O thread: remember the client by its
// descriptor and subscribe to EPOLLIN on it in the thread's own epoll manager.
// Does nothing if the thread is already stopped.
void msgr_iothread_t::add_client(osd_client_t *cl)
{
    mu.lock();
    if (!stopped)
    {
        assert(!clients[cl->peer_fd]);
        clients[cl->peer_fd] = cl;
        epmgr->tfd->set_fd_handler(cl->peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
        {
            // FIXME: Slight copypaste (see handle_peer_epoll)
            if (!(epoll_events & EPOLLIN))
            {
                return;
            }
            auto found = clients.find(peer_fd);
            if (found == clients.end())
            {
                return;
            }
            osd_client_t *ready_cl = found->second;
            ready_cl->mu.lock();
            // Only the 0 -> 1 transition queues the client for reading,
            // further events just bump the counter
            if (++ready_cl->read_ready == 1)
            {
                read_ready_clients.push_back(peer_fd);
                ring->wakeup();
            }
            ready_cl->mu.unlock();
        });
    }
    mu.unlock();
}
|
||||
|
||||
// Detach a client from this I/O thread: forget it and unsubscribe its
// descriptor from the thread's epoll manager. Ignored when the thread is
// stopped or when the descriptor is registered for a different client object.
void msgr_iothread_t::remove_client(osd_client_t *cl)
{
    mu.lock();
    if (!stopped)
    {
        auto found = clients.find(cl->peer_fd);
        // Compare the pointer too: the same fd number may belong to another client
        if (found != clients.end() && found->second == cl)
        {
            clients.erase(found);
            epmgr->tfd->set_fd_handler(cl->peer_fd, 0, NULL);
        }
    }
    mu.unlock();
}
|
||||
|
||||
void msgr_iothread_t::wakeup_out(int peer_fd, ring_loop_t *outer_ring)
|
||||
{
|
||||
write_ready_mu.lock();
|
||||
if (!write_ready_clients.size())
|
||||
{
|
||||
io_uring_sqe* sqe = outer_ring->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
write(submit_eventfd, &one, sizeof(one));
|
||||
}
|
||||
else
|
||||
{
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [](ring_data_t*){};
|
||||
my_uring_prep_write(sqe, submit_eventfd, &one, sizeof(one), 0);
|
||||
}
|
||||
}
|
||||
write_ready_clients.push_back(peer_fd);
|
||||
write_ready_mu.unlock();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::read_requests()
|
||||
{
|
||||
// FIXME: Slight copypaste (see messenger_t::read_requests)
|
||||
auto to_recv = std::move(read_ready_clients);
|
||||
for (int i = 0; i < to_recv.size(); i++)
|
||||
{
|
||||
int peer_fd = to_recv[i];
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_client_t *cl = cl_it->second;
|
||||
cl->mu.lock();
|
||||
auto ok = cl->try_recv(ring, false);
|
||||
cl->mu.unlock();
|
||||
if (!ok)
|
||||
{
|
||||
read_ready_clients.insert(read_ready_clients.end(), to_recv.begin()+i, to_recv.end());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::send_replies()
|
||||
{
|
||||
if (stopped)
|
||||
{
|
||||
return;
|
||||
}
|
||||
write_ready_mu.lock();
|
||||
auto to_send = std::move(write_ready_clients);
|
||||
write_ready_mu.unlock();
|
||||
for (int i = 0; i < to_send.size(); i++)
|
||||
{
|
||||
auto cl_it = clients.find(to_send[i]);
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto cl = cl_it->second;
|
||||
cl->mu.lock();
|
||||
auto ok = cl->try_send(ring, false/*, lock*/);
|
||||
cl->mu.unlock();
|
||||
if (!ok)
|
||||
{
|
||||
// ring is full (rare but what if...)
|
||||
write_ready_mu.lock();
|
||||
write_ready_clients.insert(write_ready_clients.end(), to_send.begin()+i, to_send.end());
|
||||
write_ready_mu.unlock();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::run()
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
ring->loop();
|
||||
mu.unlock();
|
||||
ring->wait();
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
#ifdef WITH_RDMA
|
||||
@@ -35,7 +236,7 @@ void osd_messenger_t::init()
|
||||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, EPOLLIN, [this](int notify_fd, int epoll_events)
|
||||
{
|
||||
handle_rdma_events();
|
||||
});
|
||||
@@ -43,6 +244,44 @@ void osd_messenger_t::init()
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (ringloop && iothread_count > 0)
|
||||
{
|
||||
for (int i = 0; i < iothread_count; i++)
|
||||
{
|
||||
auto iot = new msgr_iothread_t();
|
||||
iothreads.push_back(iot);
|
||||
}
|
||||
immediates_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (immediates_eventfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("failed to create set_immediate eventfd: ")+strerror(errno));
|
||||
}
|
||||
tfd->set_fd_handler(immediates_eventfd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(immediates_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
immediates_mu.lock();
|
||||
auto to_run = std::move(immediates);
|
||||
immediates_mu.unlock();
|
||||
if (!to_run.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
for (auto & cb: to_run)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
}
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
auto cl_it = clients.begin();
|
||||
@@ -120,6 +359,12 @@ void osd_messenger_t::init()
|
||||
|
||||
osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
if (immediates_eventfd >= 0)
|
||||
{
|
||||
tfd->set_fd_handler(immediates_eventfd, 0, NULL);
|
||||
close(immediates_eventfd);
|
||||
immediates_eventfd = -1;
|
||||
}
|
||||
if (keepalive_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(keepalive_timer_id);
|
||||
@@ -129,6 +374,14 @@ osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
stop_client(clients.begin()->first, true, true);
|
||||
}
|
||||
if (iothreads.size())
|
||||
{
|
||||
for (auto iot: iothreads)
|
||||
{
|
||||
delete iot;
|
||||
}
|
||||
iothreads.clear();
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_context)
|
||||
{
|
||||
@@ -165,6 +418,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->rdma_max_msg = 129*1024;
|
||||
this->rdma_odp = config["rdma_odp"].bool_value();
|
||||
#endif
|
||||
if (!osd_num)
|
||||
this->iothread_count = config["client_iothread_count"].is_null() ? 4 : (uint32_t)config["client_iothread_count"].uint64_value();
|
||||
else
|
||||
this->iothread_count = (uint32_t)config["osd_iothread_count"].uint64_value();
|
||||
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
|
||||
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
|
||||
this->receive_buffer_size = 65536;
|
||||
@@ -255,6 +512,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
{
|
||||
fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
||||
}
|
||||
clients[peer_fd]->msgr = this;
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
@@ -262,7 +520,8 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
clients[peer_fd]->connect_timeout_id = -1;
|
||||
clients[peer_fd]->osd_num = peer_osd;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
@@ -303,7 +562,11 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
cl->peer_state = PEER_CONNECTED;
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->add_client(cl);
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
@@ -487,7 +750,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(cl->peer_fd, 0, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
@@ -522,13 +785,19 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->msgr = this;
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->add_client(clients[peer_fd]);
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
|
@@ -11,6 +11,7 @@
|
||||
#include <map>
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "json11/json11.hpp"
|
||||
@@ -45,8 +46,13 @@ struct msgr_rdma_connection_t;
|
||||
struct msgr_rdma_context_t;
|
||||
#endif
|
||||
|
||||
struct osd_messenger_t;
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
std::mutex mu;
|
||||
osd_messenger_t *msgr = NULL;
|
||||
|
||||
int refs = 0;
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
@@ -59,6 +65,7 @@ struct osd_client_t
|
||||
osd_num_t osd_num = 0;
|
||||
|
||||
void *in_buf = NULL;
|
||||
uint32_t receive_buffer_size = 0;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
msgr_rdma_connection_t *rdma_conn = NULL;
|
||||
@@ -89,6 +96,17 @@ struct osd_client_t
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t();
|
||||
|
||||
bool try_send(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
int handle_send(int result);
|
||||
|
||||
bool try_recv(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
int handle_read(int result);
|
||||
bool handle_read_buffer(void *curbuf, int remain);
|
||||
bool handle_finished_read();
|
||||
void handle_op_hdr();
|
||||
bool handle_reply_hdr();
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
@@ -111,6 +129,53 @@ struct osd_op_stats_t
|
||||
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
|
||||
};
|
||||
|
||||
#ifdef __MOCK__
|
||||
class msgr_iothread_t;
|
||||
#else
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
|
||||
class msgr_iothread_t
|
||||
{
|
||||
protected:
|
||||
ring_loop_t *ring = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
ring_consumer_t consumer;
|
||||
int submit_eventfd = -1;
|
||||
bool stopped = false;
|
||||
std::mutex mu;
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::mutex write_ready_mu;
|
||||
std::vector<int> write_ready_clients;
|
||||
std::thread *thread = NULL;
|
||||
|
||||
void run();
|
||||
|
||||
void read_requests();
|
||||
|
||||
void send_replies();
|
||||
|
||||
public:
|
||||
|
||||
void handle_client_read(osd_client_t *cl, int res);
|
||||
void handle_client_send(osd_client_t *cl, int res);
|
||||
|
||||
msgr_iothread_t();
|
||||
~msgr_iothread_t();
|
||||
|
||||
void add_client(osd_client_t *cl);
|
||||
|
||||
void remove_client(osd_client_t *cl);
|
||||
|
||||
void wakeup_out(int peer_fd, ring_loop_t *outer_ring);
|
||||
|
||||
void stop();
|
||||
};
|
||||
#endif
|
||||
|
||||
struct osd_messenger_t
|
||||
{
|
||||
protected:
|
||||
@@ -123,6 +188,7 @@ protected:
|
||||
int osd_ping_timeout = 0;
|
||||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
int iothread_count = 0;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool use_rdma = true;
|
||||
@@ -134,10 +200,13 @@ protected:
|
||||
bool rdma_odp = false;
|
||||
#endif
|
||||
|
||||
std::vector<msgr_iothread_t*> iothreads;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
int immediates_eventfd = -1;
|
||||
std::mutex immediates_mu;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
std::vector<std::function<void()>> immediates;
|
||||
|
||||
public:
|
||||
timerfd_manager_t *tfd;
|
||||
@@ -155,10 +224,13 @@ public:
|
||||
void parse_config(const json11::Json & config);
|
||||
void connect_peer(uint64_t osd_num, json11::Json peer_state);
|
||||
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
|
||||
void stop_client_from_iothread(osd_client_t *cl);
|
||||
void outbox_push(osd_op_t *cur_op);
|
||||
std::function<void(osd_op_t*)> exec_op;
|
||||
std::function<void(osd_num_t)> repeer_pgs;
|
||||
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
|
||||
void handle_client_read(osd_client_t *cl, int res);
|
||||
void handle_client_send(osd_client_t *cl, int res);
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
void accept_connections(int listen_fd);
|
||||
@@ -178,6 +250,9 @@ public:
|
||||
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
|
||||
void measure_exec(osd_op_t *cur_op);
|
||||
|
||||
void set_immediate(std::function<void()> cb);
|
||||
void set_immediate_or_run(std::function<void()> cb);
|
||||
|
||||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
@@ -188,15 +263,7 @@ protected:
|
||||
void cancel_osd_ops(osd_client_t *cl);
|
||||
void cancel_op(osd_op_t *op);
|
||||
|
||||
bool try_send(osd_client_t *cl);
|
||||
void handle_send(int result, osd_client_t *cl);
|
||||
|
||||
bool handle_read(int result, osd_client_t *cl);
|
||||
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||
bool handle_finished_read(osd_client_t *cl);
|
||||
void handle_op_hdr(osd_client_t *cl);
|
||||
bool handle_reply_hdr(osd_client_t *cl);
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
void handle_immediates();
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
void try_send_rdma(osd_client_t *cl);
|
||||
@@ -205,4 +272,6 @@ protected:
|
||||
bool try_recv_rdma(osd_client_t *cl);
|
||||
void handle_rdma_events();
|
||||
#endif
|
||||
|
||||
friend struct osd_client_t;
|
||||
};
|
||||
|
@@ -603,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
|
||||
if (!is_send)
|
||||
{
|
||||
rc->cur_recv--;
|
||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
if (!cl->handle_read_buffer(rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
@@ -666,9 +666,5 @@ void osd_messenger_t::handle_rdma_events()
|
||||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
for (auto cb: set_immediate)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
set_immediate.clear();
|
||||
handle_immediates();
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include "messenger.h"
|
||||
|
||||
void osd_messenger_t::read_requests()
|
||||
@@ -9,63 +10,119 @@ void osd_messenger_t::read_requests()
|
||||
{
|
||||
int peer_fd = read_ready_clients[i];
|
||||
osd_client_t *cl = clients[peer_fd];
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
if (!cl->try_recv(ringloop, use_sync_send_recv))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (cl->read_remaining < receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_read(data->res, cl); };
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_read(result, cl);
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
read_ready_clients.clear();
|
||||
if (!iothreads.size())
|
||||
{
|
||||
handle_immediates();
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
bool osd_client_t::try_recv(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
bool ret = false;
|
||||
auto cl = this;
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (cl->read_remaining < cl->receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = cl->receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
return false;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
if (msgr->iothreads.size())
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->iothreads[peer_fd % msgr->iothreads.size()]->handle_client_read(this, data->res); };
|
||||
}
|
||||
else
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->handle_client_read(this, data->res); };
|
||||
}
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
msgr->handle_client_read(this, result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Completion handler for a client receive when no I/O threads are used.
// Lets the client process the result and acts on the returned status:
//   -ENOENT - free the client once nothing references it anymore
//   -EIO    - disconnect the client
//   -EAGAIN - queue the client for another receive
// Any other status requires no action.
void osd_messenger_t::handle_client_read(osd_client_t *cl, int res)
{
    switch (cl->handle_read(res))
    {
    case -ENOENT:
        if (!cl->refs)
        {
            delete cl;
        }
        break;
    case -EIO:
        stop_client(cl->peer_fd);
        break;
    case -EAGAIN:
        read_ready_clients.push_back(cl->peer_fd);
        break;
    default:
        break;
    }
}
|
||||
|
||||
// Completion handler for a client receive started on this I/O thread.
// The result is processed under the client's mutex; the follow-up action is
// taken after the mutex is released:
//   -ENOENT - schedule deletion of the client via the messenger's
//             set_immediate() once nothing references it anymore
//   -EIO    - ask the messenger to disconnect the client
//   -EAGAIN - queue the client for another receive and wake the ring up
void msgr_iothread_t::handle_client_read(osd_client_t *cl, int res)
{
    bool destroy = false;
    cl->mu.lock();
    res = cl->handle_read(res);
    if (res == -ENOENT)
    {
        destroy = !cl->refs;
    }
    cl->mu.unlock();
    if (res == -ENOENT)
    {
        // Schedule the deletion only after unlocking: the callback runs in
        // another thread and could otherwise destroy the client together with
        // its still locked mutex before the unlock() above is executed
        if (destroy)
        {
            cl->msgr->set_immediate([cl]() { delete cl; });
        }
    }
    else if (res == -EIO)
    {
        cl->msgr->stop_client_from_iothread(cl);
    }
    else if (res == -EAGAIN)
    {
        read_ready_clients.push_back(cl->peer_fd);
        ring->wakeup();
    }
}
|
||||
|
||||
int osd_client_t::handle_read(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
return false;
|
||||
return -ENOENT;
|
||||
}
|
||||
if (result <= 0 && result != -EAGAIN && result != -EINTR)
|
||||
{
|
||||
@@ -74,27 +131,14 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
{
|
||||
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
}
|
||||
stop_client(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len)
|
||||
{
|
||||
cl->read_ready--;
|
||||
if (cl->read_ready > 0)
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
}
|
||||
else
|
||||
{
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
return -EIO;
|
||||
}
|
||||
int expected = cl->read_iov.iov_len;
|
||||
if (result > 0)
|
||||
{
|
||||
if (cl->read_iov.iov_base == cl->in_buf)
|
||||
{
|
||||
if (!handle_read_buffer(cl, cl->in_buf, result))
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
handle_read_buffer(cl->in_buf, result);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -103,28 +147,25 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
cl->recv_list.eat(result);
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
handle_finished_read();
|
||||
}
|
||||
}
|
||||
if (result >= cl->read_iov.iov_len)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
fin:
|
||||
for (auto cb: set_immediate)
|
||||
if (result == -EAGAIN || result == -EINTR || result < expected)
|
||||
{
|
||||
cb();
|
||||
cl->read_ready--;
|
||||
assert(cl->read_ready >= 0);
|
||||
}
|
||||
set_immediate.clear();
|
||||
return ret;
|
||||
if (cl->read_ready > 0)
|
||||
{
|
||||
return -EAGAIN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
|
||||
bool osd_client_t::handle_read_buffer(void *curbuf, int remain)
|
||||
{
|
||||
auto cl = this;
|
||||
// Compose operation(s) from the buffer
|
||||
while (remain > 0)
|
||||
{
|
||||
@@ -160,7 +201,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
||||
}
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
if (!handle_finished_read())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -169,19 +210,20 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
bool osd_client_t::handle_finished_read()
|
||||
{
|
||||
auto cl = this;
|
||||
cl->recv_list.reset();
|
||||
if (cl->read_state == CL_READ_HDR)
|
||||
{
|
||||
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
||||
return handle_reply_hdr(cl);
|
||||
return handle_reply_hdr();
|
||||
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
|
||||
handle_op_hdr(cl);
|
||||
handle_op_hdr();
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -189,7 +231,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cl->read_op);
|
||||
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
|
||||
msgr->set_immediate([msgr = this->msgr, op = cl->read_op, cl]() { msgr->exec_op(op); });
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
@@ -207,8 +249,9 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
void osd_client_t::handle_op_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
osd_op_t *cur_op = cl->read_op;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
@@ -285,20 +328,21 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cur_op);
|
||||
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
|
||||
msgr->set_immediate([msgr = this->msgr, cur_op, cl]() { msgr->exec_op(cur_op); });
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
bool osd_client_t::handle_reply_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
|
||||
if (req_it == cl->sent_ops.end())
|
||||
{
|
||||
// Command out of sync. Drop connection
|
||||
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
osd_op_t *op = req_it->second;
|
||||
@@ -315,7 +359,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
|
||||
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
||||
cl->sent_ops[op->req.hdr.id] = op;
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
if (bmp_len > 0)
|
||||
@@ -391,24 +435,92 @@ reuse:
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_reply_ready(osd_op_t *op)
|
||||
void osd_client_t::handle_reply_ready(osd_op_t *op)
|
||||
{
|
||||
// Measure subop latency
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
msgr->set_immediate([msgr = this->msgr, op, cl = this]()
|
||||
{
|
||||
// Measure subop latency
|
||||
auto & stats = msgr->stats;
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
set_immediate.push_back([op]()
|
||||
{
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
{
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
// Copy lambda to be unaffected by `delete op`
|
||||
std::function<void(osd_op_t*)>(op->callback)(op);
|
||||
});
|
||||
}
|
||||
|
||||
static uint64_t one = 1;
|
||||
|
||||
void osd_messenger_t::set_immediate(std::function<void()> cb/*, ring_loop_t *ringloop*/)
|
||||
{
|
||||
if (!iothreads.size())
|
||||
{
|
||||
immediates.push_back(cb);
|
||||
return;
|
||||
}
|
||||
immediates_mu.lock();
|
||||
bool wakeup_main_thread = !immediates.size();
|
||||
immediates.push_back(cb);
|
||||
immediates_mu.unlock();
|
||||
if (wakeup_main_thread)
|
||||
{
|
||||
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
|
||||
// if (!sqe)
|
||||
// {
|
||||
write(immediates_eventfd, &one, sizeof(one));
|
||||
// FIXME: Can't use ringloop here, oops
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
// data->callback = [](ring_data_t*){};
|
||||
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::set_immediate_or_run(std::function<void()> cb/*, ring_loop_t *ringloop*/)
|
||||
{
|
||||
if (!iothreads.size())
|
||||
{
|
||||
cb();
|
||||
return;
|
||||
}
|
||||
immediates_mu.lock();
|
||||
bool wakeup_main_thread = !immediates.size();
|
||||
immediates.push_back(cb);
|
||||
immediates_mu.unlock();
|
||||
if (wakeup_main_thread)
|
||||
{
|
||||
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
|
||||
// if (!sqe)
|
||||
// {
|
||||
write(immediates_eventfd, &one, sizeof(one));
|
||||
// FIXME: Can't use ringloop here, oops
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
// data->callback = [](ring_data_t*){};
|
||||
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_immediates()
|
||||
{
|
||||
auto to_run = std::move(immediates);
|
||||
for (auto & cb: to_run)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
}
|
||||
|
@@ -15,10 +15,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||
}
|
||||
else
|
||||
else if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
}
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.lock();
|
||||
}
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
// Check that operation actually belongs to this client
|
||||
// FIXME: Review if this is still needed
|
||||
bool found = false;
|
||||
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
|
||||
{
|
||||
@@ -32,6 +39,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
if (!found)
|
||||
{
|
||||
delete cur_op;
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -39,7 +50,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
|
||||
}
|
||||
else
|
||||
@@ -108,21 +118,36 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->peer_state == PEER_RDMA)
|
||||
{
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (!ringloop)
|
||||
if (iothreads.size())
|
||||
{
|
||||
int should_wakeup = !cl->write_msg.msg_iovlen && !cl->write_state;
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
cl->mu.unlock();
|
||||
if (should_wakeup)
|
||||
{
|
||||
auto iot = iothreads[cl->peer_fd % iothreads.size()];
|
||||
iot->wakeup_out(cl->peer_fd, ringloop);
|
||||
}
|
||||
}
|
||||
else if (!ringloop)
|
||||
{
|
||||
// FIXME: It's worse because it doesn't allow batching
|
||||
while (cl->outbox.size())
|
||||
{
|
||||
try_send(cl);
|
||||
cl->try_send(NULL, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !cl->try_send(ringloop, use_sync_send_recv)) && (cl->write_state == 0))
|
||||
{
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
@@ -180,8 +205,9 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
bool osd_client_t::try_send(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
auto cl = this;
|
||||
int peer_fd = cl->peer_fd;
|
||||
if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
|
||||
{
|
||||
@@ -198,7 +224,14 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||
cl->refs++;
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
||||
if (msgr->iothreads.size())
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->iothreads[this->peer_fd % msgr->iothreads.size()]->handle_client_send(this, data->res); };
|
||||
}
|
||||
else
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->handle_client_send(this, data->res); };
|
||||
}
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
||||
}
|
||||
else
|
||||
@@ -211,18 +244,68 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_send(result, cl);
|
||||
msgr->handle_client_send(this, result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_client_send(osd_client_t *cl, int res)
|
||||
{
|
||||
res = cl->handle_send(res);
|
||||
if (res == -ENOENT)
|
||||
{
|
||||
if (!cl->refs)
|
||||
delete cl;
|
||||
}
|
||||
else if (res == -EIO)
|
||||
{
|
||||
stop_client(cl->peer_fd);
|
||||
}
|
||||
else if (res == -EAGAIN)
|
||||
{
|
||||
write_ready_clients.push_back(cl->peer_fd);
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::handle_client_send(osd_client_t *cl, int res)
|
||||
{
|
||||
cl->mu.lock();
|
||||
res = cl->handle_send(res);
|
||||
if (res == -ENOENT)
|
||||
{
|
||||
if (!cl->refs)
|
||||
cl->msgr->set_immediate([cl]() { delete cl; });
|
||||
}
|
||||
cl->mu.unlock();
|
||||
if (res == -EIO)
|
||||
{
|
||||
cl->msgr->stop_client_from_iothread(cl);
|
||||
}
|
||||
else if (res == -EAGAIN)
|
||||
{
|
||||
write_ready_mu.lock();
|
||||
write_ready_clients.push_back(cl->peer_fd);
|
||||
write_ready_mu.unlock();
|
||||
ring->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::send_replies()
|
||||
{
|
||||
if (iothreads.size())
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||
{
|
||||
int peer_fd = write_ready_clients[i];
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end() && !try_send(cl_it->second))
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->try_send(ringloop, use_sync_send_recv))
|
||||
{
|
||||
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
||||
return;
|
||||
@@ -231,24 +314,20 @@ void osd_messenger_t::send_replies()
|
||||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
int osd_client_t::handle_send(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
cl->write_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
return;
|
||||
return -ENOENT;
|
||||
}
|
||||
if (result < 0 && result != -EAGAIN && result != -EINTR)
|
||||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
stop_client(cl->peer_fd);
|
||||
return;
|
||||
return -EIO;
|
||||
}
|
||||
if (result >= 0)
|
||||
{
|
||||
@@ -261,7 +340,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[done].op;
|
||||
msgr->set_immediate_or_run([op = cl->outbox[done].op] { delete op; });
|
||||
}
|
||||
result -= iov.iov_len;
|
||||
done++;
|
||||
@@ -291,26 +370,35 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
// FIXME: Do something better than just forgetting the FD
|
||||
// FIXME: Ignore pings during RDMA state transition
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
msgr->set_immediate_or_run([cl = this, msgr = this->msgr, peer_fd = this->peer_fd]()
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
auto cl_it = msgr->clients.find(peer_fd);
|
||||
if (cl_it == msgr->clients.end() || cl_it->second != cl)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
return;
|
||||
}
|
||||
if (msgr->log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with client %d using RDMA\n", peer_fd);
|
||||
}
|
||||
msgr->tfd->set_fd_handler(peer_fd, 0, [msgr](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
msgr->handle_peer_epoll(peer_fd, epoll_events);
|
||||
}
|
||||
});
|
||||
// Add the initial receive request
|
||||
msgr->try_recv_rdma(cl);
|
||||
});
|
||||
// Add the initial receive request
|
||||
try_recv_rdma(cl);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (cl->write_state != 0)
|
||||
{
|
||||
write_ready_clients.push_back(cl->peer_fd);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@@ -11,6 +11,7 @@
|
||||
|
||||
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
||||
{
|
||||
cl->mu.lock();
|
||||
std::vector<osd_op_t*> cancel_ops;
|
||||
cancel_ops.resize(cl->sent_ops.size());
|
||||
int i = 0;
|
||||
@@ -20,6 +21,7 @@ void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
||||
}
|
||||
cl->sent_ops.clear();
|
||||
cl->outbox.clear();
|
||||
cl->mu.unlock();
|
||||
for (auto op: cancel_ops)
|
||||
{
|
||||
cancel_op(op);
|
||||
@@ -53,8 +55,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
return;
|
||||
}
|
||||
osd_client_t *cl = it->second;
|
||||
cl->mu.lock();
|
||||
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
cl->mu.unlock();
|
||||
return;
|
||||
}
|
||||
if (log_level > 0)
|
||||
@@ -71,6 +75,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||
cl->refs++;
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
cl->mu.unlock();
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// ...and forget OSD peer
|
||||
@@ -78,7 +83,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// Then remove FD from the eventloop so we don't accidentally read something
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->remove_client(cl);
|
||||
}
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
@@ -108,17 +117,24 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
repeer_pgs(cl->osd_num);
|
||||
}
|
||||
// Then cancel all operations
|
||||
cl->mu.lock();
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (!cl->read_op->callback)
|
||||
auto op = cl->read_op;
|
||||
cl->read_op = NULL;
|
||||
cl->mu.unlock();
|
||||
if (!op->callback)
|
||||
{
|
||||
delete cl->read_op;
|
||||
delete op;
|
||||
}
|
||||
else
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
cancel_op(op);
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
@@ -131,11 +147,32 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
{
|
||||
clients.erase(it);
|
||||
}
|
||||
cl->mu.lock();
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0 || force_delete)
|
||||
{
|
||||
cl->mu.unlock();
|
||||
delete cl;
|
||||
}
|
||||
else
|
||||
cl->mu.unlock();
|
||||
}
|
||||
|
||||
void osd_messenger_t::stop_client_from_iothread(osd_client_t *cl)
|
||||
{
|
||||
if (!iothreads.size())
|
||||
{
|
||||
stop_client(cl->peer_fd);
|
||||
return;
|
||||
}
|
||||
set_immediate([this, cl, peer_fd = cl->peer_fd]()
|
||||
{
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end() && cl_it->second == cl)
|
||||
{
|
||||
stop_client(peer_fd);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
osd_client_t::~osd_client_t()
|
||||
|
@@ -655,7 +655,7 @@ help:
|
||||
ringloop->register_consumer(&consumer);
|
||||
// Add FD to epoll
|
||||
bool stop = false;
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], false, [this, &stop](int peer_fd, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], EPOLLIN, [this, &stop](int peer_fd, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
|
@@ -12,6 +12,7 @@ add_library(vitastor_cli STATIC
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
cli_modify.cpp
|
||||
cli_osd_tree.cpp
|
||||
cli_flatten.cpp
|
||||
cli_merge.cpp
|
||||
cli_rm_data.cpp
|
||||
|
@@ -118,6 +118,12 @@ static const char* help_text =
|
||||
" With --dry-run only checks if deletion is possible without data loss and\n"
|
||||
" redundancy degradation.\n"
|
||||
"\n"
|
||||
"vitastor-cli osd-tree\n"
|
||||
" Show current OSD tree.\n"
|
||||
"\n"
|
||||
"vitastor-cli osds|ls-osd|osd-ls\n"
|
||||
" Show current OSDs as list.\n"
|
||||
"\n"
|
||||
"vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]\n"
|
||||
" Create a pool. Required parameters:\n"
|
||||
" -s|--pg_size R Number of replicas for replicated pools\n"
|
||||
@@ -389,6 +395,17 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
// Allocate a new OSD number
|
||||
action_cb = p->start_alloc_osd(cfg);
|
||||
}
|
||||
else if (cmd[0] == "osd-tree")
|
||||
{
|
||||
// Print OSD tree
|
||||
action_cb = p->start_osd_tree(cfg);
|
||||
}
|
||||
else if (cmd[0] == "osds" || cmd[0] == "ls-osds" || cmd[0] == "ls-osd" || cmd[0] == "osd-ls")
|
||||
{
|
||||
// Print OSD list
|
||||
cfg["flat"] = true;
|
||||
action_cb = p->start_osd_tree(cfg);
|
||||
}
|
||||
else if (cmd[0] == "create-pool" || cmd[0] == "pool-create")
|
||||
{
|
||||
// Create a new pool
|
||||
|
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "json11/json11.hpp"
|
||||
#include "object_id.h"
|
||||
#include "osd_id.h"
|
||||
#include "ringloop.h"
|
||||
#include <functional>
|
||||
|
||||
@@ -56,27 +57,31 @@ public:
|
||||
friend struct snap_flattener_t;
|
||||
friend struct snap_remover_t;
|
||||
|
||||
std::function<bool(cli_result_t &)> start_status(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_create(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_describe(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_fix(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_create(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_merge(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json cfg);
|
||||
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_merge(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_osd_tree(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_create(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_rm(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_status(json11::Json);
|
||||
|
||||
// Should be called like loop_and_wait(start_status(), <completion callback>)
|
||||
void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);
|
||||
|
||||
void etcd_txn(json11::Json txn);
|
||||
|
||||
void iterate_kvs_1(json11::Json kvs, const std::string & prefix, std::function<void(uint64_t num, json11::Json)> cb);
|
||||
void iterate_kvs_2(json11::Json kvs, const std::string & prefix, std::function<void(pool_id_t pool_id, uint64_t num, json11::Json)> cb);
|
||||
};
|
||||
|
||||
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
|
||||
|
@@ -72,19 +72,10 @@ struct alloc_osd_t
|
||||
if (!parent->etcd_result["succeeded"].bool_value())
|
||||
{
|
||||
std::vector<osd_num_t> used;
|
||||
for (auto kv: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items())
|
||||
parent->iterate_kvs_1(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
|
||||
{
|
||||
std::string key = base64_decode(kv["key"].string_value());
|
||||
osd_num_t cur_osd;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%ju%c", &cur_osd, &null_byte);
|
||||
if (scanned != 1 || !cur_osd)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
|
||||
continue;
|
||||
}
|
||||
used.push_back(cur_osd);
|
||||
}
|
||||
});
|
||||
std::sort(used.begin(), used.end());
|
||||
if (used[used.size()-1] == used.size())
|
||||
{
|
||||
|
@@ -165,3 +165,43 @@ void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std:
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void cli_tool_t::iterate_kvs_1(json11::Json kvs, const std::string & prefix, std::function<void(uint64_t, json11::Json)> cb)
|
||||
{
|
||||
bool is_pool = prefix == "/pool/stats/";
|
||||
for (auto & kv_item: kvs.array_items())
|
||||
{
|
||||
auto kv = cli->st_cli.parse_etcd_kv(kv_item);
|
||||
uint64_t num = 0;
|
||||
char null_byte = 0;
|
||||
// OSD or pool number
|
||||
int scanned = sscanf(kv.key.substr(cli->st_cli.etcd_prefix.size() + prefix.size()).c_str(), "%ju%c", &num, &null_byte);
|
||||
if (scanned != 1 || !num || is_pool && num >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
cb(num, kv.value);
|
||||
}
|
||||
}
|
||||
|
||||
void cli_tool_t::iterate_kvs_2(json11::Json kvs, const std::string & prefix, std::function<void(pool_id_t pool_id, uint64_t num, json11::Json)> cb)
|
||||
{
|
||||
bool is_inode = prefix == "/config/inode/" || prefix == "/inode/stats/";
|
||||
for (auto & kv_item: kvs.array_items())
|
||||
{
|
||||
auto kv = cli->st_cli.parse_etcd_kv(kv_item);
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t num = 0;
|
||||
char null_byte = 0;
|
||||
// pool+pg or pool+inode
|
||||
int scanned = sscanf(kv.key.substr(cli->st_cli.etcd_prefix.size() + prefix.size()).c_str(),
|
||||
"%u/%ju%c", &pool_id, &num, &null_byte);
|
||||
if (scanned != 2 || !pool_id || is_inode && INODE_POOL(num) || !is_inode && num >= UINT32_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
cb(pool_id, num, kv.value);
|
||||
}
|
||||
}
|
||||
|
@@ -479,10 +479,14 @@ struct snap_merger_t
|
||||
{
|
||||
if (op->retval != op->len)
|
||||
{
|
||||
rwo->error_code = -op->retval;
|
||||
rwo->error_code = op->retval;
|
||||
rwo->error_offset = op->offset;
|
||||
rwo->error_read = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
rwo->error_code = 0;
|
||||
}
|
||||
continue_rwo.push_back(rwo);
|
||||
parent->ringloop->wakeup();
|
||||
};
|
||||
@@ -553,12 +557,15 @@ struct snap_merger_t
|
||||
if (use_cas && subop->retval == -EINTR)
|
||||
{
|
||||
// CAS failure - reread and repeat optimistically
|
||||
assert(rwo->todo == 1); // initial refcount from read_and_write
|
||||
rwo->error_code = -EINTR;
|
||||
rwo->start = rwo->end = 0;
|
||||
rwo->op.version = 0;
|
||||
rwo_read(rwo);
|
||||
delete subop;
|
||||
return;
|
||||
}
|
||||
rwo->error_code = -subop->retval;
|
||||
rwo->error_code = subop->retval;
|
||||
rwo->error_offset = subop->offset;
|
||||
rwo->error_read = false;
|
||||
}
|
||||
@@ -633,7 +640,7 @@ struct snap_merger_t
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
|
||||
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
|
||||
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(-rwo->error_code));
|
||||
rwo_error = std::string(buf);
|
||||
}
|
||||
delete rwo;
|
||||
|
377
src/cmd/cli_osd_tree.cpp
Normal file
377
src/cmd/cli_osd_tree.cpp
Normal file
@@ -0,0 +1,377 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2024
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <ctype.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "epoll_manager.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
struct placement_osd_t
|
||||
{
|
||||
osd_num_t num;
|
||||
std::string parent;
|
||||
std::vector<std::string> tags;
|
||||
uint64_t size;
|
||||
uint64_t free;
|
||||
bool up;
|
||||
double reweight;
|
||||
uint32_t block_size, bitmap_granularity, immediate_commit;
|
||||
};
|
||||
|
||||
struct placement_node_t
|
||||
{
|
||||
std::string name;
|
||||
std::string parent;
|
||||
std::string level;
|
||||
std::vector<std::string> child_nodes;
|
||||
std::vector<osd_num_t> child_osds;
|
||||
};
|
||||
|
||||
struct placement_tree_t
|
||||
{
|
||||
std::map<std::string, placement_node_t> nodes;
|
||||
std::map<osd_num_t, placement_osd_t> osds;
|
||||
};
|
||||
|
||||
struct osd_tree_printer_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
json11::Json cfg;
|
||||
bool flat = false;
|
||||
bool show_stats = false;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
json11::Json node_placement;
|
||||
std::map<uint64_t, json11::Json> osd_config;
|
||||
std::map<uint64_t, json11::Json> osd_stats;
|
||||
std::shared_ptr<placement_tree_t> placement_tree;
|
||||
|
||||
bool is_done() { return state == 100; }
|
||||
|
||||
void load_osd_tree()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/node_placement") },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/") },
|
||||
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd0") },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/") },
|
||||
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats0") },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
});
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
for (auto & item: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items())
|
||||
{
|
||||
node_placement = parent->cli->st_cli.parse_etcd_kv(item).value;
|
||||
}
|
||||
parent->iterate_kvs_1(parent->etcd_result["responses"][1]["response_range"]["kvs"], "/config/osd/", [&](uint64_t cur_osd, json11::Json value)
|
||||
{
|
||||
osd_config[cur_osd] = value;
|
||||
});
|
||||
parent->iterate_kvs_1(parent->etcd_result["responses"][2]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
|
||||
{
|
||||
osd_stats[cur_osd] = value;
|
||||
});
|
||||
placement_tree = make_osd_tree(node_placement, osd_config, osd_stats);
|
||||
}
|
||||
|
||||
std::shared_ptr<placement_tree_t> make_osd_tree(json11::Json node_placement_json,
|
||||
std::map<uint64_t, json11::Json> osd_config, std::map<uint64_t, json11::Json> osd_stats)
|
||||
{
|
||||
auto node_placement = node_placement_json.object_items();
|
||||
auto tree = std::make_shared<placement_tree_t>();
|
||||
tree->nodes[""] = (placement_node_t){};
|
||||
// Add non-OSD items
|
||||
for (auto & kv: node_placement)
|
||||
{
|
||||
auto osd_num = stoull_full(kv.first);
|
||||
if (!osd_num)
|
||||
{
|
||||
auto level = kv.second["level"].string_value();
|
||||
tree->nodes[kv.first] = (placement_node_t){
|
||||
.name = kv.first,
|
||||
.parent = kv.second["parent"].string_value(),
|
||||
.level = level == "" ? "unknown" : level,
|
||||
};
|
||||
}
|
||||
}
|
||||
// Add OSDs
|
||||
for (auto & kv: osd_stats)
|
||||
{
|
||||
auto & osd = tree->osds[kv.first] = (placement_osd_t){
|
||||
.num = kv.first,
|
||||
.parent = kv.second["host"].string_value(),
|
||||
.size = kv.second["size"].uint64_value(),
|
||||
.free = kv.second["free"].uint64_value(),
|
||||
.up = parent->cli->st_cli.peer_states.find(kv.first) != parent->cli->st_cli.peer_states.end(),
|
||||
.reweight = 1,
|
||||
.block_size = (uint32_t)kv.second["data_block_size"].uint64_value(),
|
||||
.bitmap_granularity = (uint32_t)kv.second["bitmap_granularity"].uint64_value(),
|
||||
.immediate_commit = etcd_state_client_t::parse_immediate_commit(kv.second["immediate_commit"].string_value()),
|
||||
};
|
||||
if (tree->nodes.find(osd.parent) == tree->nodes.end())
|
||||
{
|
||||
// Autocreate all hosts
|
||||
tree->nodes[osd.parent] = (placement_node_t){
|
||||
.name = osd.parent,
|
||||
.level = "host",
|
||||
};
|
||||
}
|
||||
auto cfg_it = osd_config.find(osd.num);
|
||||
if (cfg_it != osd_config.end())
|
||||
{
|
||||
auto & osd_cfg = cfg_it->second;
|
||||
osd.reweight = osd_cfg["reweight"].is_number() ? osd_cfg["reweight"].number_value() : 1;
|
||||
if (osd_cfg["tags"].is_array())
|
||||
{
|
||||
for (auto & jtag: osd_cfg["tags"].array_items())
|
||||
osd.tags.push_back(jtag.string_value());
|
||||
}
|
||||
}
|
||||
auto np_it = node_placement.find(std::to_string(osd.num));
|
||||
if (np_it != node_placement.end())
|
||||
{
|
||||
osd.parent = np_it->second["parent"].string_value();
|
||||
}
|
||||
tree->nodes[osd.parent].child_osds.push_back(osd.num);
|
||||
}
|
||||
// Fill child_nodes
|
||||
for (auto & ip: tree->nodes)
|
||||
{
|
||||
if (tree->nodes.find(ip.second.parent) == tree->nodes.end())
|
||||
{
|
||||
ip.second.parent = "";
|
||||
}
|
||||
if (ip.first != "")
|
||||
{
|
||||
tree->nodes[ip.second.parent].child_nodes.push_back(ip.first);
|
||||
}
|
||||
}
|
||||
// FIXME: Maybe filter out loops here
|
||||
return tree;
|
||||
}
|
||||
|
||||
std::string format_tree()
|
||||
{
|
||||
std::vector<std::string> node_seq = { "" };
|
||||
std::vector<int> indents = { -1 };
|
||||
std::map<std::string, bool> seen;
|
||||
for (int i = 0; i < node_seq.size(); i++)
|
||||
{
|
||||
if (seen[node_seq[i]])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
seen[node_seq[i]] = true;
|
||||
auto & child_nodes = placement_tree->nodes.at(node_seq[i]).child_nodes;
|
||||
if (child_nodes.size())
|
||||
{
|
||||
node_seq.insert(node_seq.begin()+i+1, child_nodes.begin(), child_nodes.end());
|
||||
indents.insert(indents.begin()+i+1, child_nodes.size(), indents[i]+1);
|
||||
}
|
||||
}
|
||||
json11::Json::array fmt_items;
|
||||
for (int i = 1; i < node_seq.size(); i++)
|
||||
{
|
||||
auto & node = placement_tree->nodes.at(node_seq[i]);
|
||||
if (!flat)
|
||||
{
|
||||
fmt_items.push_back(json11::Json::object{
|
||||
{ "type", str_repeat(" ", indents[i]) + node.level },
|
||||
{ "name", node.name },
|
||||
});
|
||||
}
|
||||
std::string parent = node.name;
|
||||
if (flat)
|
||||
{
|
||||
auto cur = &placement_tree->nodes.at(node.name);
|
||||
while (cur->parent != "" && cur->parent != node.name)
|
||||
{
|
||||
parent = cur->parent+"/"+parent;
|
||||
cur = &placement_tree->nodes.at(cur->parent);
|
||||
}
|
||||
}
|
||||
for (uint64_t osd_num: node.child_osds)
|
||||
{
|
||||
auto & osd = placement_tree->osds.at(osd_num);
|
||||
auto fmt = json11::Json::object{
|
||||
{ "type", (flat ? "osd" : str_repeat(" ", indents[i]+1) + "osd") },
|
||||
{ "name", osd.num },
|
||||
{ "parent", parent },
|
||||
{ "up", osd.up ? "up" : "down" },
|
||||
{ "size", format_size(osd.size, false, true) },
|
||||
{ "used", format_q(100.0*(osd.size - osd.free)/osd.size)+" %" },
|
||||
{ "reweight", format_q(osd.reweight) },
|
||||
{ "tags", implode(",", osd.tags) },
|
||||
{ "block", format_size(osd.block_size, false, true) },
|
||||
{ "bitmap", format_size(osd.bitmap_granularity, false, true) },
|
||||
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
|
||||
};
|
||||
if (show_stats)
|
||||
{
|
||||
auto op_stat = osd_stats[osd_num]["op_stats"];
|
||||
fmt["read_bw"] = format_size(op_stat["primary_read"]["bps"].uint64_value())+"/s";
|
||||
fmt["write_bw"] = format_size(op_stat["primary_write"]["bps"].uint64_value())+"/s";
|
||||
fmt["delete_bw"] = format_size(op_stat["primary_delete"]["bps"].uint64_value())+"/s";
|
||||
fmt["read_iops"] = format_q(op_stat["primary_read"]["iops"].uint64_value());
|
||||
fmt["write_iops"] = format_q(op_stat["primary_write"]["iops"].uint64_value());
|
||||
fmt["delete_iops"] = format_q(op_stat["primary_delete"]["iops"].uint64_value());
|
||||
fmt["read_lat"] = format_lat(op_stat["primary_read"]["lat"].uint64_value());
|
||||
fmt["write_lat"] = format_lat(op_stat["primary_write"]["lat"].uint64_value());
|
||||
fmt["delete_lat"] = format_lat(op_stat["primary_delete"]["lat"].uint64_value());
|
||||
}
|
||||
fmt_items.push_back(std::move(fmt));
|
||||
}
|
||||
}
|
||||
json11::Json::array cols;
|
||||
if (!flat)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "type" },
|
||||
{ "title", "TYPE" },
|
||||
});
|
||||
}
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "name" },
|
||||
{ "title", flat ? "OSD" : "NAME" },
|
||||
});
|
||||
if (flat)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "parent" },
|
||||
{ "title", "PARENT" },
|
||||
});
|
||||
}
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "up" },
|
||||
{ "title", "UP" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "size" },
|
||||
{ "title", "SIZE" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "used" },
|
||||
{ "title", "USED%" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "tags" },
|
||||
{ "title", "TAGS" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "reweight" },
|
||||
{ "title", "WEIGHT" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "block" },
|
||||
{ "title", "BLOCK" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "bitmap" },
|
||||
{ "title", "BITMAP" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "commit" },
|
||||
{ "title", "IMM" },
|
||||
});
|
||||
if (show_stats)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "read_bw" },
|
||||
{ "title", "READ" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "read_iops" },
|
||||
{ "title", "IOPS" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "read_lat" },
|
||||
{ "title", "LAT" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "write_bw" },
|
||||
{ "title", "WRITE" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "write_iops" },
|
||||
{ "title", "IOPS" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "write_lat" },
|
||||
{ "title", "LAT" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "delete_bw" },
|
||||
{ "title", "DEL" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "delete_iops" },
|
||||
{ "title", "IOPS" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "delete_lat" },
|
||||
{ "title", "LAT" },
|
||||
});
|
||||
}
|
||||
return print_table(fmt_items, cols, parent->color);
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
resume_1:
|
||||
load_osd_tree();
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
result.text = format_tree();
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_osd_tree(json11::Json cfg)
|
||||
{
|
||||
auto osd_tree_printer = new osd_tree_printer_t();
|
||||
osd_tree_printer->parent = this;
|
||||
osd_tree_printer->cfg = cfg;
|
||||
osd_tree_printer->flat = cfg["flat"].bool_value();
|
||||
osd_tree_printer->show_stats = cfg["long"].bool_value();
|
||||
return [osd_tree_printer](cli_result_t & result)
|
||||
{
|
||||
osd_tree_printer->loop();
|
||||
if (osd_tree_printer->is_done())
|
||||
{
|
||||
result = osd_tree_printer->result;
|
||||
delete osd_tree_printer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -104,37 +104,16 @@ resume_1:
|
||||
{
|
||||
config_pools = parent->cli->st_cli.parse_etcd_kv(config_pools).value;
|
||||
}
|
||||
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
|
||||
parent->iterate_kvs_1(space_info["responses"][0]["response_range"]["kvs"], "/pool/stats/", [&](uint64_t pool_id, json11::Json value)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
|
||||
// pool ID
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
|
||||
if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
// pool/stats/<N>
|
||||
pool_stats[pool_id] = kv.value.object_items();
|
||||
}
|
||||
pool_stats[pool_id] = value.object_items();
|
||||
});
|
||||
std::map<pool_id_t, uint64_t> osd_free;
|
||||
for (auto & kv_item: space_info["responses"][1]["response_range"]["kvs"].array_items())
|
||||
parent->iterate_kvs_1(space_info["responses"][1]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t osd_num, json11::Json value)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
|
||||
// osd ID
|
||||
osd_num_t osd_num;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
|
||||
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
// osd/stats/<N>::free
|
||||
osd_free[osd_num] = kv.value["free"].uint64_value();
|
||||
}
|
||||
osd_free[osd_num] = value["free"].uint64_value();
|
||||
});
|
||||
// Calculate max_avail for each pool
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
@@ -254,29 +233,17 @@ resume_1:
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
auto pg_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
|
||||
// Calculate recovery percent
|
||||
std::map<pool_id_t, object_counts_t> counts;
|
||||
for (auto & kv_item: pg_stats.array_items())
|
||||
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pg/stats/",
|
||||
[&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
|
||||
// pool ID & pg number
|
||||
pool_id_t pool_id;
|
||||
pg_num_t pg_num = 0;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
|
||||
"/pg/stats/%u/%u%c", &pool_id, &pg_num, &null_byte);
|
||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
auto & cnt = counts[pool_id];
|
||||
cnt.object_count += kv.value["object_count"].uint64_value();
|
||||
cnt.misplaced_count += kv.value["misplaced_count"].uint64_value();
|
||||
cnt.degraded_count += kv.value["degraded_count"].uint64_value();
|
||||
cnt.incomplete_count += kv.value["incomplete_count"].uint64_value();
|
||||
}
|
||||
cnt.object_count += value["object_count"].uint64_value();
|
||||
cnt.misplaced_count += value["misplaced_count"].uint64_value();
|
||||
cnt.degraded_count += value["degraded_count"].uint64_value();
|
||||
cnt.incomplete_count += value["incomplete_count"].uint64_value();
|
||||
});
|
||||
for (auto & pp: pool_stats)
|
||||
{
|
||||
auto & cnt = counts[pp.first];
|
||||
@@ -317,35 +284,23 @@ resume_1:
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
auto inode_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
|
||||
// Performance statistics
|
||||
std::map<pool_id_t, io_stats_t> pool_io;
|
||||
for (auto & kv_item: inode_stats.array_items())
|
||||
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/inode/stats/",
|
||||
[&](pool_id_t pool_id, uint64_t inode_num, json11::Json value)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
|
||||
// pool ID & inode number
|
||||
pool_id_t pool_id;
|
||||
inode_t only_inode_num;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
|
||||
"/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
|
||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
auto & io = pool_io[pool_id];
|
||||
io.read_iops += kv.value["read"]["iops"].uint64_value();
|
||||
io.read_bps += kv.value["read"]["bps"].uint64_value();
|
||||
io.read_lat += kv.value["read"]["lat"].uint64_value();
|
||||
io.write_iops += kv.value["write"]["iops"].uint64_value();
|
||||
io.write_bps += kv.value["write"]["bps"].uint64_value();
|
||||
io.write_lat += kv.value["write"]["lat"].uint64_value();
|
||||
io.delete_iops += kv.value["delete"]["iops"].uint64_value();
|
||||
io.delete_bps += kv.value["delete"]["bps"].uint64_value();
|
||||
io.delete_lat += kv.value["delete"]["lat"].uint64_value();
|
||||
io.read_iops += value["read"]["iops"].uint64_value();
|
||||
io.read_bps += value["read"]["bps"].uint64_value();
|
||||
io.read_lat += value["read"]["lat"].uint64_value();
|
||||
io.write_iops += value["write"]["iops"].uint64_value();
|
||||
io.write_bps += value["write"]["bps"].uint64_value();
|
||||
io.write_lat += value["write"]["lat"].uint64_value();
|
||||
io.delete_iops += value["delete"]["iops"].uint64_value();
|
||||
io.delete_bps += value["delete"]["bps"].uint64_value();
|
||||
io.delete_lat += value["delete"]["lat"].uint64_value();
|
||||
io.count++;
|
||||
}
|
||||
});
|
||||
for (auto & pp: pool_stats)
|
||||
{
|
||||
auto & io = pool_io[pp.first];
|
||||
|
@@ -18,7 +18,7 @@ struct status_printer_t
|
||||
cli_tool_t *parent;
|
||||
|
||||
int state = 0;
|
||||
json11::Json::array mon_members, osd_stats;
|
||||
json11::Json::array mon_members;
|
||||
json11::Json agg_stats;
|
||||
std::map<pool_id_t, json11::Json::object> pool_stats;
|
||||
json11::Json::array etcd_states;
|
||||
@@ -93,7 +93,7 @@ resume_2:
|
||||
return;
|
||||
}
|
||||
mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
|
||||
osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
|
||||
auto osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"];
|
||||
if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
|
||||
{
|
||||
agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
|
||||
@@ -133,20 +133,11 @@ resume_2:
|
||||
}
|
||||
int osd_count = 0, osd_up = 0;
|
||||
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
|
||||
for (int i = 0; i < osd_stats.size(); i++)
|
||||
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
|
||||
osd_num_t stat_osd_num = 0;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%ju%c", &stat_osd_num, &null_byte);
|
||||
if (scanned != 1 || !stat_osd_num)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
osd_count++;
|
||||
auto osd_size = kv.value["size"].uint64_value();
|
||||
auto osd_free = kv.value["free"].uint64_value();
|
||||
auto osd_size = value["size"].uint64_value();
|
||||
auto osd_free = value["free"].uint64_value();
|
||||
total_raw += osd_size;
|
||||
free_raw += osd_free;
|
||||
if (!osd_free)
|
||||
@@ -164,10 +155,10 @@ resume_2:
|
||||
}
|
||||
else
|
||||
{
|
||||
down_raw += kv.value["size"].uint64_value();
|
||||
free_down_raw += kv.value["free"].uint64_value();
|
||||
down_raw += value["size"].uint64_value();
|
||||
free_down_raw += value["free"].uint64_value();
|
||||
}
|
||||
}
|
||||
});
|
||||
int pool_count = 0, pools_active = 0;
|
||||
std::map<std::string, int> pgs_by_state;
|
||||
std::string pgs_by_state_str;
|
||||
|
@@ -185,7 +185,7 @@ void kv_cli_t::run()
|
||||
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
|
||||
try
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
|
||||
epmgr->tfd->set_fd_handler(0, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
if (events & EPOLLIN)
|
||||
{
|
||||
@@ -193,7 +193,7 @@ void kv_cli_t::run()
|
||||
}
|
||||
if (events & EPOLLRDHUP)
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, NULL);
|
||||
epmgr->tfd->set_fd_handler(0, 0, NULL);
|
||||
finished = true;
|
||||
}
|
||||
});
|
||||
|
@@ -189,6 +189,12 @@ void nfs_proxy_t::run(json11::Json cfg)
|
||||
cmd->epmgr = epmgr;
|
||||
cmd->cli = cli;
|
||||
watch_stats();
|
||||
// Init Pseudo-FS before starting client because it depends on inode_change_hook
|
||||
if (fsname == "")
|
||||
{
|
||||
blockfs = new block_fs_state_t();
|
||||
blockfs->init(this, cfg);
|
||||
}
|
||||
// Load image metadata
|
||||
while (!cli->is_ready())
|
||||
{
|
||||
@@ -199,13 +205,8 @@ void nfs_proxy_t::run(json11::Json cfg)
|
||||
}
|
||||
// Check default pool
|
||||
check_default_pool();
|
||||
// Check if we're using VitastorFS
|
||||
if (fsname == "")
|
||||
{
|
||||
blockfs = new block_fs_state_t();
|
||||
blockfs->init(this, cfg);
|
||||
}
|
||||
else
|
||||
// Init VitastorFS after starting client because it depends on loaded inode configuration
|
||||
if (fsname != "")
|
||||
{
|
||||
kvfs = new kv_fs_state_t();
|
||||
kvfs->init(this, cfg);
|
||||
@@ -242,7 +243,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
||||
// Create NFS socket and add it to epoll
|
||||
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port);
|
||||
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(nfs_socket, EPOLLIN, [this](int nfs_socket, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
@@ -259,7 +260,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
||||
// Create portmap socket and add it to epoll
|
||||
int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
|
||||
fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(portmap_socket, EPOLLIN, [this](int portmap_socket, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
@@ -465,7 +466,7 @@ void nfs_proxy_t::do_accept(int listen_fd)
|
||||
{
|
||||
cli->proc_table.insert(fn);
|
||||
}
|
||||
epmgr->tfd->set_fd_handler(nfs_fd, true, [cli](int nfs_fd, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(nfs_fd, EPOLLIN|EPOLLOUT, [cli](int nfs_fd, int epoll_events)
|
||||
{
|
||||
// Handle incoming event
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
@@ -722,7 +723,7 @@ void nfs_client_t::stop()
|
||||
stopped = true;
|
||||
if (refs <= 0)
|
||||
{
|
||||
parent->epmgr->tfd->set_fd_handler(nfs_fd, true, NULL);
|
||||
parent->epmgr->tfd->set_fd_handler(nfs_fd, 0, NULL);
|
||||
close(nfs_fd);
|
||||
delete this;
|
||||
}
|
||||
|
@@ -141,6 +141,14 @@ void osd_t::parse_config(bool init)
|
||||
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
|
||||
if (config.find("log_level") == this->config.end())
|
||||
config["log_level"] = 1;
|
||||
if (init)
|
||||
{
|
||||
// OSD number
|
||||
osd_num = config["osd_num"].uint64_value();
|
||||
if (!osd_num)
|
||||
throw std::runtime_error("osd_num is required in the configuration");
|
||||
msgr.osd_num = osd_num;
|
||||
}
|
||||
if (bs)
|
||||
{
|
||||
auto bs_cfg = json_to_bs(config);
|
||||
@@ -150,11 +158,6 @@ void osd_t::parse_config(bool init)
|
||||
msgr.parse_config(config);
|
||||
if (init)
|
||||
{
|
||||
// OSD number
|
||||
osd_num = config["osd_num"].uint64_value();
|
||||
if (!osd_num)
|
||||
throw std::runtime_error("osd_num is required in the configuration");
|
||||
msgr.osd_num = osd_num;
|
||||
// Vital Blockstore parameters
|
||||
bs_block_size = config["block_size"].uint64_value();
|
||||
if (!bs_block_size)
|
||||
@@ -361,7 +364,7 @@ void osd_t::bind_socket()
|
||||
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
|
||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||
epmgr->set_fd_handler(listen_fd, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
msgr.accept_connections(listen_fd);
|
||||
});
|
||||
|
@@ -199,12 +199,14 @@ class osd_t
|
||||
ring_consumer_t consumer;
|
||||
|
||||
// op statistics
|
||||
osd_op_stats_t prev_stats;
|
||||
osd_op_stats_t prev_stats, prev_report_stats;
|
||||
timespec report_stats_ts;
|
||||
std::map<uint64_t, inode_stats_t> inode_stats;
|
||||
std::map<uint64_t, timespec> vanishing_inodes;
|
||||
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
|
||||
recovery_stat_t recovery_stat[2];
|
||||
recovery_stat_t recovery_print_prev[2];
|
||||
recovery_stat_t recovery_report_prev[2];
|
||||
|
||||
// recovery auto-tuning
|
||||
int rtune_timer_id = -1;
|
||||
@@ -252,6 +254,7 @@ class osd_t
|
||||
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
||||
void repeer_pgs(osd_num_t osd_num);
|
||||
void start_pg_peering(pg_t & pg);
|
||||
void drop_dirty_pg_connections(pool_pg_num_t pg);
|
||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||
void discard_list_subop(osd_op_t *list_op);
|
||||
bool stop_pg(pg_t & pg);
|
||||
|
@@ -180,6 +180,12 @@ json11::Json osd_t::get_statistics()
|
||||
json11::Json::object st;
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
uint64_t ts_diff = 0;
|
||||
if (report_stats_ts.tv_sec != 0)
|
||||
ts_diff = (ts.tv_sec - report_stats_ts.tv_sec + (ts.tv_nsec - report_stats_ts.tv_nsec) / 1000000000);
|
||||
if (!ts_diff)
|
||||
ts_diff = 1;
|
||||
report_stats_ts = ts;
|
||||
char time_str[50] = { 0 };
|
||||
sprintf(time_str, "%jd.%03ld", (uint64_t)ts.tv_sec, ts.tv_nsec/1000000);
|
||||
st["time"] = time_str;
|
||||
@@ -196,33 +202,50 @@ json11::Json osd_t::get_statistics()
|
||||
json11::Json::object op_stats, subop_stats;
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
auto n = (msgr.stats.op_stat_count[i] - prev_report_stats.op_stat_count[i]);
|
||||
op_stats[osd_op_names[i]] = json11::Json::object {
|
||||
{ "count", msgr.stats.op_stat_count[i] },
|
||||
{ "usec", msgr.stats.op_stat_sum[i] },
|
||||
{ "bytes", msgr.stats.op_stat_bytes[i] },
|
||||
{ "lat", (msgr.stats.op_stat_sum[i] - prev_report_stats.op_stat_sum[i]) / (n < 1 ? 1 : n) },
|
||||
{ "bps", (msgr.stats.op_stat_bytes[i] - prev_report_stats.op_stat_bytes[i]) / ts_diff },
|
||||
{ "iops", n / ts_diff },
|
||||
};
|
||||
}
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
auto n = (msgr.stats.subop_stat_count[i] - prev_report_stats.subop_stat_count[i]);
|
||||
subop_stats[osd_op_names[i]] = json11::Json::object {
|
||||
{ "count", msgr.stats.subop_stat_count[i] },
|
||||
{ "usec", msgr.stats.subop_stat_sum[i] },
|
||||
{ "lat", (msgr.stats.subop_stat_sum[i] - prev_report_stats.subop_stat_sum[i]) / (n < 1 ? 1 : n) },
|
||||
{ "iops", n / ts_diff },
|
||||
};
|
||||
}
|
||||
st["op_stats"] = op_stats;
|
||||
st["subop_stats"] = subop_stats;
|
||||
auto n0 = recovery_stat[0].count - recovery_report_prev[0].count;
|
||||
auto n1 = recovery_stat[1].count - recovery_report_prev[1].count;
|
||||
st["recovery_stats"] = json11::Json::object {
|
||||
{ recovery_stat_names[0], json11::Json::object {
|
||||
{ "count", recovery_stat[0].count },
|
||||
{ "bytes", recovery_stat[0].bytes },
|
||||
{ "usec", recovery_stat[0].usec },
|
||||
{ "lat", (recovery_stat[0].usec - recovery_report_prev[0].usec) / (n0 < 1 ? 1 : n0) },
|
||||
{ "bps", (recovery_stat[0].bytes - recovery_report_prev[0].bytes) / ts_diff },
|
||||
{ "iops", n0 / ts_diff },
|
||||
} },
|
||||
{ recovery_stat_names[1], json11::Json::object {
|
||||
{ "count", recovery_stat[1].count },
|
||||
{ "bytes", recovery_stat[1].bytes },
|
||||
{ "usec", recovery_stat[1].usec },
|
||||
{ "lat", (recovery_stat[1].usec - recovery_report_prev[1].usec) / (n1 < 1 ? 1 : n1) },
|
||||
{ "bps", (recovery_stat[1].bytes - recovery_report_prev[1].bytes) / ts_diff },
|
||||
{ "iops", n1 / ts_diff },
|
||||
} },
|
||||
};
|
||||
prev_report_stats = msgr.stats;
|
||||
memcpy(recovery_report_prev, recovery_stat, sizeof(recovery_stat));
|
||||
return st;
|
||||
}
|
||||
|
||||
|
@@ -168,20 +168,15 @@ void osd_t::reset_pg(pg_t & pg)
|
||||
dirty_pgs.erase({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
}
|
||||
|
||||
// Repeer on each connect/disconnect peer event
|
||||
void osd_t::start_pg_peering(pg_t & pg)
|
||||
// Drop connections of clients who have this PG in dirty_pgs
|
||||
void osd_t::drop_dirty_pg_connections(pool_pg_num_t pg)
|
||||
{
|
||||
pg.state = PG_PEERING;
|
||||
this->peering_state |= OSD_PEERING_PGS;
|
||||
reset_pg(pg);
|
||||
report_pg_state(pg);
|
||||
// Drop connections of clients who have this PG in dirty_pgs
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
std::vector<int> to_stop;
|
||||
for (auto & cp: msgr.clients)
|
||||
{
|
||||
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
|
||||
if (cp.second->dirty_pgs.find(pg) != cp.second->dirty_pgs.end())
|
||||
{
|
||||
to_stop.push_back(cp.first);
|
||||
}
|
||||
@@ -191,6 +186,16 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
msgr.stop_client(peer_fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Repeer on each connect/disconnect peer event
|
||||
void osd_t::start_pg_peering(pg_t & pg)
|
||||
{
|
||||
pg.state = PG_PEERING;
|
||||
this->peering_state |= OSD_PEERING_PGS;
|
||||
reset_pg(pg);
|
||||
report_pg_state(pg);
|
||||
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
// Try to connect with current peers if they're up, but we don't have connections to them
|
||||
// Otherwise we may erroneously decide that the pg is incomplete :-)
|
||||
for (auto pg_osd: pg.all_peers)
|
||||
@@ -460,6 +465,7 @@ bool osd_t::stop_pg(pg_t & pg)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
|
@@ -247,6 +247,7 @@ resume_8:
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
// FIXME: Do it before executing sync
|
||||
auto it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (it != msgr.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
|
@@ -43,7 +43,7 @@ int main(int narg, char *args[])
|
||||
// Accept new connections
|
||||
int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->set_fd_handler(listen_fd, false, [listen_fd, msgr](int fd, int events)
|
||||
epmgr->set_fd_handler(listen_fd, EPOLLIN, [listen_fd, msgr](int fd, int events)
|
||||
{
|
||||
msgr->accept_connections(listen_fd);
|
||||
});
|
||||
|
@@ -43,8 +43,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
|
||||
},
|
||||
});
|
||||
cli->st_cli.on_load_pgs_hook(true);
|
||||
std::map<std::string, etcd_kv_t> changes;
|
||||
cli->st_cli.on_change_hook(changes);
|
||||
cli->st_cli.on_change_pool_config_hook();
|
||||
}
|
||||
|
||||
int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL, bool instant = false)
|
||||
@@ -281,7 +280,8 @@ void test1()
|
||||
uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58);
|
||||
if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
|
||||
{
|
||||
printf("Write replay: mismatch at %ju\n", offset-op->req.rw.offset);
|
||||
printf("Write replay: mismatch at %ju (expected %02x, have %02x)\n", offset-op->req.rw.offset,
|
||||
c, ((uint8_t*)op->iov.buf[buf_idx].iov_base)[i]);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
@@ -290,9 +290,9 @@ void test1()
|
||||
assert(offset == op->req.rw.offset+op->req.rw.len);
|
||||
replay_ops.push_back(op);
|
||||
}
|
||||
if (replay_start != 0 || replay_end != 0x14000)
|
||||
if (replay_start != 0 || replay_end != 0x10000)
|
||||
{
|
||||
printf("Write replay: range mismatch: %jx-%jx\n", replay_start, replay_end);
|
||||
printf("Write replay: range mismatch: 0x%jx-0x%jx (expected 0-0x10000)\n", replay_start, replay_end);
|
||||
assert(0);
|
||||
}
|
||||
for (auto op: replay_ops)
|
||||
@@ -320,8 +320,6 @@ void test1()
|
||||
check_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_completed(r1);
|
||||
@@ -341,7 +339,7 @@ void test1()
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x2000), 0);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);
|
||||
|
@@ -21,7 +21,7 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
|
||||
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
|
||||
}
|
||||
|
||||
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
|
||||
tfd = new timerfd_manager_t([this](int fd, int events, std::function<void(int, int)> handler) { set_fd_handler(fd, events, handler); });
|
||||
|
||||
if (ringloop)
|
||||
{
|
||||
@@ -54,14 +54,14 @@ int epoll_manager_t::get_fd()
|
||||
return epoll_fd;
|
||||
}
|
||||
|
||||
void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
|
||||
void epoll_manager_t::set_fd_handler(int fd, int events, std::function<void(int, int)> handler)
|
||||
{
|
||||
if (handler != NULL)
|
||||
{
|
||||
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
|
||||
epoll_event ev;
|
||||
ev.data.fd = fd;
|
||||
ev.events = (wr ? EPOLLOUT : 0) | EPOLLIN | EPOLLRDHUP | EPOLLET;
|
||||
ev.events = events | EPOLLRDHUP | EPOLLET;
|
||||
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
|
||||
{
|
||||
if (errno == ENOENT)
|
||||
|
@@ -3,6 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <sys/epoll.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "ringloop.h"
|
||||
@@ -21,7 +23,7 @@ public:
|
||||
epoll_manager_t(ring_loop_t *ringloop);
|
||||
~epoll_manager_t();
|
||||
int get_fd();
|
||||
void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
|
||||
void set_fd_handler(int fd, int events, std::function<void(int, int)> handler);
|
||||
void handle_events(int timeout);
|
||||
|
||||
timerfd_manager_t *tfd;
|
||||
|
@@ -32,12 +32,22 @@ static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const s
|
||||
my_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_read(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset, int buf_index)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
|
||||
sqe->buf_index = buf_index;
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_write(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_writev(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
|
||||
|
@@ -151,10 +151,11 @@ static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*
|
||||
static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
|
||||
static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
|
||||
static const char *size_unit = "TGMKB";
|
||||
static const char *size_unit_ns = "TGMk ";
|
||||
|
||||
std::string format_size(uint64_t size, bool nobytes)
|
||||
std::string format_size(uint64_t size, bool nobytes, bool nospace)
|
||||
{
|
||||
uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
|
||||
uint64_t *thr = (nobytes ? size_thresh_d : size_thresh);
|
||||
char buf[256];
|
||||
for (int i = 0; i < size_thresh_n; i++)
|
||||
{
|
||||
@@ -165,9 +166,19 @@ std::string format_size(uint64_t size, bool nobytes)
|
||||
assert(l < sizeof(buf)-2);
|
||||
if (buf[l-1] == '0')
|
||||
l -= 2;
|
||||
buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
|
||||
buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
|
||||
buf[l+2] = 0;
|
||||
if (i == size_thresh_n-1 && nobytes)
|
||||
buf[l] = 0;
|
||||
else if (nospace)
|
||||
{
|
||||
buf[l] = size_unit_ns[i];
|
||||
buf[l+1] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
buf[l] = ' ';
|
||||
buf[l+1] = size_unit[i];
|
||||
buf[l+2] = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@@ -16,7 +16,7 @@ std::string strtolower(const std::string & in);
|
||||
std::string trim(const std::string & in, const char *rm_chars = " \n\r\t");
|
||||
std::string str_replace(const std::string & in, const std::string & needle, const std::string & replacement);
|
||||
uint64_t stoull_full(const std::string & str, int base = 0);
|
||||
std::string format_size(uint64_t size, bool nobytes = false);
|
||||
std::string format_size(uint64_t size, bool nobytes = false, bool nospace = false);
|
||||
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
||||
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||
std::string read_all_fd(int fd);
|
||||
|
@@ -11,7 +11,7 @@
|
||||
#include <stdexcept>
|
||||
#include "timerfd_manager.h"
|
||||
|
||||
timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler)
|
||||
timerfd_manager_t::timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler)
|
||||
{
|
||||
this->set_fd_handler = set_fd_handler;
|
||||
wait_state = 0;
|
||||
@@ -20,7 +20,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
|
||||
{
|
||||
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
|
||||
}
|
||||
set_fd_handler(timerfd, false, [this](int fd, int events)
|
||||
set_fd_handler(timerfd, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
handle_readable();
|
||||
});
|
||||
@@ -28,7 +28,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
|
||||
|
||||
timerfd_manager_t::~timerfd_manager_t()
|
||||
{
|
||||
set_fd_handler(timerfd, false, NULL);
|
||||
set_fd_handler(timerfd, 0, NULL);
|
||||
close(timerfd);
|
||||
}
|
||||
|
||||
|
@@ -30,9 +30,9 @@ class timerfd_manager_t
|
||||
void trigger_nearest();
|
||||
void handle_readable();
|
||||
public:
|
||||
std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler;
|
||||
std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler;
|
||||
|
||||
timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler);
|
||||
timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler);
|
||||
~timerfd_manager_t();
|
||||
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
|
||||
int set_timer_us(uint64_t micros, bool repeat, std::function<void(int)> callback);
|
||||
|
@@ -20,7 +20,6 @@ cd `dirname $0`/..
|
||||
|
||||
trap 'kill -9 $(jobs -p)' EXIT
|
||||
|
||||
ANTIETCD=1
|
||||
ETCD=${ETCD:-etcd}
|
||||
ETCD_IP=${ETCD_IP:-127.0.0.1}
|
||||
ETCD_PORT=${ETCD_PORT:-12379}
|
||||
@@ -33,20 +32,14 @@ if [ "$KEEP_DATA" = "" ]; then
|
||||
fi
|
||||
|
||||
ETCD_URL="http://$ETCD_IP:$ETCD_PORT"
|
||||
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT+1))"
|
||||
for i in $(seq 2 $ETCD_COUNT); do
|
||||
ETCD_URL="$ETCD_URL,http://$ETCD_IP:$((ETCD_PORT+2*i-2))"
|
||||
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-1))"
|
||||
done
|
||||
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=5s --command-timeout=10s"
|
||||
|
||||
start_etcd()
|
||||
{
|
||||
if [[ "$ANTIETCD" -eq "1" ]]; then
|
||||
start_antietcd $*
|
||||
else
|
||||
start__etcd $*
|
||||
fi
|
||||
}
|
||||
|
||||
start__etcd()
|
||||
{
|
||||
local i=$1
|
||||
local t=/run/user/$(id -u)
|
||||
@@ -60,65 +53,15 @@ start__etcd()
|
||||
eval ETCD${i}_PID=$!
|
||||
}
|
||||
|
||||
start_etcd_cluster()
|
||||
{
|
||||
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT+1))"
|
||||
for i in $(seq 2 $ETCD_COUNT); do
|
||||
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-1))"
|
||||
done
|
||||
for i in $(seq 1 $ETCD_COUNT); do
|
||||
start__etcd $i
|
||||
done
|
||||
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=5s --command-timeout=10s"
|
||||
for i in {1..30}; do
|
||||
${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=1s --command-timeout=1s member list >/dev/null && break
|
||||
if [[ $i = 30 ]]; then
|
||||
format_error "Failed to start etcd"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
start_antietcd()
|
||||
{
|
||||
local i=$1
|
||||
local t=/run/user/$(id -u)
|
||||
findmnt $t >/dev/null || (sudo mkdir -p $t && sudo mount -t tmpfs tmpfs $t)
|
||||
local persist=""
|
||||
if [[ -n "$ANTIETCD_PERSISTENCE" ]]; then
|
||||
persist="--data ./testdata/antietcd$i.json.gz --persist_interval 500"
|
||||
for i in $(seq 1 $ETCD_COUNT); do
|
||||
start_etcd $i
|
||||
done
|
||||
for i in {1..30}; do
|
||||
${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=1s --command-timeout=1s member list >/dev/null && break
|
||||
if [[ $i = 30 ]]; then
|
||||
format_error "Failed to start etcd"
|
||||
fi
|
||||
local cluster=""
|
||||
if [[ $ETCD_COUNT -gt 1 ]]; then
|
||||
cluster="--node_id etcd$i --cluster_key abcdef --cluster $ETCD_CLUSTER"
|
||||
fi
|
||||
nodejs mon/tinyraft/antietcd-app.js $persist --port $((ETCD_PORT+2*i-2)) $cluster &>./testdata/antietcd$i.log &
|
||||
eval ETCD${i}_PID=$!
|
||||
}
|
||||
|
||||
start_antietcd_cluster()
|
||||
{
|
||||
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT))"
|
||||
for i in $(seq 2 $ETCD_COUNT); do
|
||||
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-2))"
|
||||
done
|
||||
for i in $(seq 1 $ETCD_COUNT); do
|
||||
start_antietcd $i
|
||||
done
|
||||
ETCDCTL="nodejs mon/tinyraft/anticli.js -e $ETCD_URL"
|
||||
for i in {1..30}; do
|
||||
nodejs mon/tinyraft/anticli.js -e "$ETCD_URL" get --prefix / && break
|
||||
if [[ $i = 30 ]]; then
|
||||
format_error "Failed to start antietcd"
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
if [[ "$ANTIETCD" -eq "1" ]]; then
|
||||
start_antietcd_cluster
|
||||
else
|
||||
start_etcd_cluster
|
||||
fi
|
||||
done
|
||||
|
||||
echo leak:fio >> testdata/lsan-suppress.txt
|
||||
echo leak:tcmalloc >> testdata/lsan-suppress.txt
|
||||
|
@@ -54,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do
|
||||
start_osd $i
|
||||
done
|
||||
|
||||
(while true; do set +e; node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1; if [[ $? -ne 2 ]]; then break; fi; done) >>./testdata/mon.log 2>&1 &
|
||||
node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 >>./testdata/mon.log 2>&1 &
|
||||
MON_PID=$!
|
||||
|
||||
if [ "$SCHEME" = "ec" ]; then
|
||||
|
@@ -15,7 +15,7 @@ for i in $(seq 1 $OSD_COUNT); do
|
||||
eval OSD${i}_PID=$!
|
||||
done
|
||||
|
||||
(while true; do node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) >>./testdata/mon.log 2>&1 &
|
||||
node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 >>./testdata/mon.log 2>&1 &
|
||||
MON_PID=$!
|
||||
|
||||
sleep 3
|
||||
|
Reference in New Issue
Block a user