forked from vitalif/vitastor

Compare commits: hugo-docs ... zerocopy-t (2 commits: 385aca9d44, 2697aae909)
@@ -1,55 +0,0 @@
#!/usr/bin/nodejs

const fs = require('fs');
const yaml = require('yaml');

// Localized labels for the generated parameter lists
const L = {
    en: {},
    ru: {
        Type: 'Тип',
        Default: 'Значение по умолчанию',
        Minimum: 'Минимальное значение',
    },
};

// Human-readable names of parameter types, per language
const types = {
    en: {
        string: 'string',
        bool: 'boolean',
        int: 'integer',
        sec: 'seconds',
        ms: 'milliseconds',
        us: 'microseconds',
    },
    ru: {
        string: 'строка',
        bool: 'булево (да/нет)',
        int: 'целое число',
        sec: 'секунды',
        ms: 'миллисекунды',
        us: 'микросекунды',
    },
};

// Every params/*.yml file describes one group of configuration parameters
const params_files = fs.readdirSync(__dirname+'/params')
    .filter(f => f.substr(-4) == '.yml')
    .map(f => f.substr(0, f.length-4));

for (const file of params_files)
{
    const cfg = yaml.parse(fs.readFileSync(__dirname+'/params/'+file+'.yml', { encoding: 'utf-8' }));
    for (const lang in types)
    {
        // Render each parameter as a section: heading, type/default/minimum list, description
        let out = '\n\n{{< toc >}}';
        for (const c of cfg)
        {
            out += `\n\n## ${c.name}\n\n`;
            out += `- ${L[lang]['Type'] || 'Type'}: ${c["type_"+lang] || types[lang][c.type] || c.type}\n`;
            if (c.default !== undefined)
                out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
            if (c.min !== undefined)
                out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
            out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
        }
        // Prepend the static page header and write the resulting Hugo content page
        const head = fs.readFileSync(__dirname+'/params/head/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
        fs.writeFileSync(__dirname+'/hugo/content/config/'+file+'.'+lang+'.md', head.replace(/\s+$/, '')+out+"\n");
    }
}
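
The shape of the `params/*.yml` input files follows from the code above: each file is a YAML list of parameter descriptions. A minimal sketch of one entry — the parameter itself is invented for illustration, only the field names (`name`, `type`, `default`, `min`, `info`, `info_<lang>`) come from the generator:

```
- name: example_timeout
  type: sec
  default: 5
  min: 1
  info: |
    Hypothetical parameter, shown only to illustrate the input format.
  info_ru: |
    Гипотетический параметр, приведён только для иллюстрации формата.
```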
@@ -1,6 +0,0 @@
---
title: "{{ replace .Name "-" " " | title }}"
date: {{ .Date }}
draft: true
---
@@ -1,35 +0,0 @@
baseURL: http://localhost
title: Vitastor
theme: hugo-geekdoc
#languageCode: en-us

pluralizeListTitles: false

# Geekdoc required configuration
pygmentsUseClasses: true
pygmentsCodeFences: true
disablePathToLower: true

# Required if you want to render the robots.txt template
enableRobotsTXT: true

defaultContentLanguage: en
languages:
  en:
    weight: 1
    languageName: English
  ru:
    weight: 1
    languageName: Русский

markup:
  goldmark:
    renderer:
      # Needed for the mermaid shortcode
      unsafe: true
  tableOfContents:
    startLevel: 1
    endLevel: 9

taxonomies:
  tag: tags
@@ -1,6 +0,0 @@
## The Idea

Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph, which means strong consistency, primary replication,
symmetric clustering and automatic data distribution over any number of drives
of any size with configurable redundancy (replication or erasure codes/XOR).
@@ -1,61 +0,0 @@
---
title: Parameter Reference
weight: 1
---

Vitastor configuration consists of:
- Configuration parameters (key-value), described here
- [Pool configuration]({{< ref "config/pool" >}})
- OSD placement tree configuration
- Inode configuration, i.e. image metadata like name, size and parent reference

Configuration parameters can be set in 3 places:
- Configuration file (`/etc/vitastor/vitastor.conf` or another path)
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
  connection parameters should obviously be set in the configuration file.
- Command line of Vitastor components: OSD, mon, fio and QEMU options,
  OpenStack/Proxmox/etc configuration. The latter doesn't allow setting all
  variables directly, but it allows overriding the configuration file and
  setting everything you need inside it.
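
For example, a global parameter is written to the etcd key with etcdctl; a minimal sketch, with the endpoint list left as a placeholder:

```
etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'
```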
In the future, additional configuration methods may be added:
- OSD superblock which will, by design, contain parameters related to the disk
  layout and to one specific OSD.
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`.

## Common Parameters

These are the most common parameters which apply to all components of Vitastor.

[See the list]({{< ref "common" >}})

## Cluster-Wide Disk Layout Parameters

These parameters apply to clients and OSDs and can't be changed after OSD
initialization.

[See the list]({{< ref "layout-cluster" >}})

## OSD Disk Layout Parameters

These parameters apply to OSDs and can't be changed after OSD initialization.

[See the list]({{< ref "layout-osd" >}})

## Network Protocol Parameters

These parameters apply to clients and OSDs and can be changed with a restart.

[See the list]({{< ref "network" >}})

## Runtime OSD Parameters

These parameters apply to OSDs and can be changed with an OSD restart.

[See the list]({{< ref "osd" >}})

## Monitor Parameters

These parameters only apply to Monitors.

[See the list]({{< ref "monitor" >}})
@@ -1,63 +0,0 @@
---
title: Parameter Reference
weight: 1
---

Vitastor configuration consists of:
- Parameters (key-value), described on this page
- Pool configuration
- OSD placement tree configuration
- Inode configuration, i.e. image metadata such as name, size and parent image
  references

Configuration parameters can be set in 3 places:
- The configuration file (`/etc/vitastor/vitastor.conf` or another path)
- The etcd key `/vitastor/config/global`. Most parameters can be set there,
  except, naturally, the etcd connection parameters themselves, which must be
  set in the configuration file
- The command line of Vitastor components: OSD, monitor, fio and QEMU options,
  OpenStack/Proxmox/etc settings. The latter usually don't expose the full
  parameter set directly, but they allow specifying the configuration file path
  and setting any parameters in it.

In the future, other configuration methods may also be added:
- An OSD superblock, which will store OSD parameters related to the disk
  layout and to that specific OSD.
- OSD-specific etcd keys like `/vitastor/config/osd/<number>`.

## Common Parameters

These are the most common parameters, used by all components of Vitastor.

[See the list]({{< ref "common" >}})

## Cluster-Wide Disk Layout Parameters

These parameters are used by clients and OSDs and can't be changed after OSD
initialization.

[See the list]({{< ref "layout-cluster" >}})

## OSD Disk Layout Parameters

These parameters are used by OSDs and can't be changed after OSD initialization.

[See the list]({{< ref "layout-osd" >}})

## Network Protocol Parameters

These parameters are used by clients and OSDs and can be changed with a restart.

[See the list]({{< ref "network" >}})

## Runtime OSD Parameters

These parameters are used by OSDs and can be changed with an OSD restart.

[See the list]({{< ref "osd" >}})

## Monitor Parameters

These parameters are only used by Vitastor monitors.

[See the list]({{< ref "monitor" >}})
@@ -1,178 +0,0 @@
---
title: Pool configuration
weight: 100
---

Pool configuration is set in the etcd key `/vitastor/config/pools` in the following
JSON format:

```
{
  "<Numeric ID>": {
    "name": "<name>",
    ...other parameters...
  }
}
```
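
The key is written with etcdctl, as in the pool examples at the end of this page; a minimal sketch, with the endpoint list left as a placeholder:

```
etcdctl --endpoints=... put /vitastor/config/pools \
  '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
```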
{{< toc >}}

# Parameters

## name

- Type: string
- Required

Pool name.

## scheme

- Type: string
- Required
- One of: "replicated", "xor" or "jerasure"

Redundancy scheme used for data in this pool.

## pg_size

- Type: integer
- Required

Total number of disks for PGs of this pool - i.e., the number of replicas for
replicated pools and the number of data plus parity disks for EC/XOR pools.

## parity_chunks

- Type: integer

Number of parity chunks for EC/XOR pools. For such pools, data will be lost
if you lose more than parity_chunks disks at once, so this parameter can
equally be described as FTT (number of failures to tolerate).

Required for EC/XOR pools, ignored for replicated pools.

## pg_minsize

- Type: integer
- Required

Number of live disks required for PGs of this pool to remain active.
That is, if it becomes impossible to place PG data on at least (pg_minsize)
OSDs, the PG is deactivated for both reads and writes. So you know that a fresh
write always goes to at least (pg_minsize) OSDs (disks).

FIXME: pg_minsize behaviour may be changed in the future to only make PGs
read-only instead of deactivating them.

## pg_count

- Type: integer
- Required

Number of PGs for this pool. The value should be big enough for the monitor /
LP solver to be able to optimize data placement.

"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for a pool
to (total OSD count * 100 / pg_size) - for example, 32 OSDs with pg_size 2 give
32 * 100 / 2 = 1600. You can round it to the closest power of 2 (here, 2048),
because that makes it easier to reduce or increase the PG count later by
dividing or multiplying it by 2.

In Vitastor, PGs are ephemeral, so you can change the pool PG count anytime just
by overwriting the pool configuration in etcd. The amount of data affected by a
rebalance will be smaller if the new PG count is a multiple of the old PG count
or vice versa.

## failure_domain

- Type: string
- Default: host

Failure domain specification. Must be "host" or "osd" or refer to one of the
placement tree levels defined in [placement_levels]({{< ref "config/monitor#placement_levels" >}}).

Two replicas, or two parts in the case of EC/XOR, of the same block of data are
never put on OSDs in the same failure domain (for example, on the same host).
So the failure domain specifies the unit whose failure you are protecting yourself
from.

## max_osd_combinations

- Type: integer
- Default: 10000

The Vitastor data placement algorithm is based on the LP solver, and the OSD
combinations fed to it are generated randomly. This parameter specifies the
maximum number of combinations to generate when optimising PG placement.

This parameter usually doesn't need to be changed.

## pg_stripe_size

- Type: integer
- Default: 0

Specifies the stripe size for this pool, according to which images are split into
different PGs. The stripe size can't be smaller than [block_size]({{< ref "config/layout-cluster#block_size" >}})
multiplied by (pg_size - parity_chunks) for EC/XOR pools, or by 1 for replicated pools,
and the same value is used by default.

This means the first `pg_stripe_size = (block_size * (pg_size-parity_chunks))` bytes
of an image go to one PG, the next `pg_stripe_size` bytes go to another PG, and so on.

It usually doesn't need to be changed separately from the block size.
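
As a quick worked example - assuming the default 128 KB block size mentioned in the architecture overview - an EC 2+1 pool (pg_size 3, parity_chunks 1) gets a default stripe of 128 KB × (3 − 1) = 256 KB, so each consecutive 256 KB extent of an image maps to a single PG.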
## root_node

- Type: string

Specifies the root node of the OSD tree to restrict this pool's OSDs to.
The referenced root node must exist in /vitastor/config/node_placement.

## osd_tags

- Type: string or array of strings

Specifies OSD tags to restrict this pool to. If multiple tags are specified,
only OSDs having all of these tags will be used for this pool.

## primary_affinity_tags

- Type: string or array of strings

Specifies OSD tags to prefer when choosing primary OSDs for this pool.
Note that for EC/XOR pools Vitastor always prefers to put the primary OSD on one
of the OSDs containing a data chunk for a PG.

# Examples

## Replicated pool

```
{
  "1": {
    "name": "testpool",
    "scheme": "replicated",
    "pg_size": 2,
    "pg_minsize": 1,
    "pg_count": 256,
    "failure_domain": "host"
  }
}
```

## Erasure-coded pool

```
{
  "2": {
    "name": "ecpool",
    "scheme": "jerasure",
    "pg_size": 3,
    "parity_chunks": 1,
    "pg_minsize": 2,
    "pg_count": 256,
    "failure_domain": "host"
  }
}
```
@@ -1,41 +0,0 @@
---
title: Packages
weight: 2
---

## Debian

- Trust the Vitastor package signing key:
  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Add the Vitastor package repository to your /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- For Debian 10 (Buster), also enable the backports repository:
  `deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

## CentOS

- Add the Vitastor package repository:
  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
  - CentOS 7: `yum install centos-release-scl`
  - CentOS 8: `dnf install centos-release-advanced-virtualization`
- Enable elrepo-kernel:
  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`

## Installation requirements

- Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly
  recommended, because io_uring is a relatively new technology and there is
  at least one known bug that reproduces with io_uring and HP SmartArray
  controllers on 5.4
- liburing 0.4 or newer
- lp_solve
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
  for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
- node.js 10 or newer
@@ -1,72 +0,0 @@
---
title: Quick Start
weight: 1
---

Prepare:

- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
  with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
  [here]({{< ref "config/layout-cluster#immediate_commit" >}}).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Install Vitastor packages]({{< ref "installation/packages" >}}).

## Configure monitors

On the monitor hosts:
- Edit the variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to the desired values.
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Start etcd and the monitors: `systemctl start etcd vitastor-mon`

## Configure OSDs

- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
  ```
  {
    "etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
    "osd_network": "10.200.1.0/24"
  }
  ```
- Initialize OSDs:
  - Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` — pass all your
    devices (HDD and SSD) to this script — it will partition the disks and initialize journals on its own.
    This script skips HDDs which are already partitioned, so if you want to use non-empty disks for
    Vitastor you should first wipe them with `wipefs -a`. SSDs with a GPT partition table are not skipped,
    but some free unpartitioned space must be available because the script creates new partitions for journals.
- You can change OSD configuration in the units or in `vitastor.conf`.
  Check the [Configuration Reference]({{< ref "config" >}}) for parameter descriptions.
- Run `systemctl start vitastor.target` everywhere.
- If all your drives have capacitors, create the global configuration in etcd: \
  `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`

## Create a pool

Create the pool configuration in etcd:

```
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
  "scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
```

For jerasure pools, the configuration should look like this:

```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
  "scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```

After you do this, one of the monitors will configure the PGs and the OSDs will start them.

You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.

## Create an image

Use vitastor-cli ([read the CLI documentation here]({{< ref "usage/cli" >}})):

```
vitastor-cli create -s 10G testimg
```

After that, you can run benchmarks or start QEMU manually with this image.
@@ -1,54 +0,0 @@
---
title: Building from Source
weight: 3
---

## Requirements

- gcc and g++ 8 or newer, clang 10 or newer, or another compiler with C++11 plus
  designated initializers support from C++20
- CMake
- liburing and jerasure headers

## Basic instructions

Download the source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`

Get the `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build the fio engine,
you can disable it by passing `-DWITH_FIO=no` to cmake.

Build and install Vitastor:

```
cd vitastor
mkdir build
cd build
cmake .. && make -j8 install
```

## QEMU Driver

It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as part of
the QEMU build process. To do that:
- Install the Vitastor client library headers (from source or from the vitastor-client-dev package)
- Take the corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to the QEMU source
- Copy `src/qemu_driver.c` into the QEMU source directory as `block/block-vitastor.c`
- Build QEMU as usual
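
A sketch of those steps as shell commands — the patch file name and the patch level are placeholders here, pick the `patches/qemu-*-vitastor.patch` matching your QEMU version:

```
cd qemu
patch -p1 < ../vitastor/patches/qemu-6.0-vitastor.patch
cp ../vitastor/src/qemu_driver.c block/block-vitastor.c
./configure --target-list=x86_64-softmmu && make -j8
```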
But it is also possible to build it out-of-tree. To do that:
- Get the QEMU source, begin to build it, stop the build and copy the headers:
  - `<qemu>/include` → `<vitastor>/qemu/include`
  - Debian:
    * Use qemu packages from the main repository
    * `<qemu>/b/qemu/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h`
    * `<qemu>/b/qemu/qapi` → `<vitastor>/qemu/b/qemu/qapi`
  - CentOS 8:
    * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
      `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
    * `<qemu>/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h`
    * For QEMU 3.0+: `<qemu>/qapi` → `<vitastor>/qemu/b/qemu/qapi`
    * For QEMU 2.0+: `<qemu>/qapi-types.h` → `<vitastor>/qemu/b/qemu/qapi-types.h`
  - `config-host.h` and `qapi` are required because they contain generated headers
- Configure Vitastor with `WITH_QEMU=yes` and, if you're on RHEL, also with `QEMU_PLUGINDIR=qemu-kvm`:
  `cmake .. -DWITH_QEMU=yes`.
- After that, Vitastor will build `block-vitastor.so` during its build process.
@@ -1,4 +0,0 @@
---
title: Introduction
weight: -1
---
@@ -1,73 +0,0 @@
---
title: Architecture
weight: 3
---

For people familiar with Ceph, Vitastor is quite similar:

- Vitastor also has Pools, PGs, OSDs, Monitors, Failure Domains and a Placement Tree:
  - OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
  - PG (Placement Group) is a container for data that (normally) shares the same replicas.
  - Pool is a container for data that has the same redundancy scheme and placement rules.
  - Monitor is a separate daemon that watches cluster state and controls data distribution.
  - Failure Domain is a group of OSDs that you allow to fail together. It's "host" by default.
  - Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
- Vitastor also distributes every image's data across the whole cluster.
- Vitastor is also transactional (every write to the cluster is atomic).
- OSDs also have journal and metadata, and they can also be put on separate drives.
- Just like in Ceph, the client library attempts to recover from any cluster failure, so
  you can basically reboot the whole cluster and only pause, but not crash, your clients
  (please report a bug if a client crashes in that case).

However, there are also differences:

- Vitastor's main focus is on SSDs. Hybrid SSD+HDD setups are also possible.
- Vitastor OSD is (and will always be) single-threaded. If you want to dedicate more than 1 core
  per drive you should run multiple OSDs, each on a different partition of the drive.
  Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
- Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
  and data store block size, which is 128 KB by default. With 128 KB blocks metadata should occupy
  around 512 MB per 1 TB (which is still less than Ceph wants). The journal doesn't have to be big;
  the example test below was conducted with only a 16 MB journal. A big journal is probably even
  harmful, as dirty write metadata also takes some memory.
- Vitastor's storage layer doesn't have internal copy-on-write or redirect-write. It may be
  possible to create a good copy-on-write storage, but it's much harder and makes performance
  less deterministic, so CoW isn't used in Vitastor.
- The basic layer of Vitastor is block storage with fixed-size blocks, not object storage with
  rich semantics like in Ceph (RADOS).
- There's a "lazy fsync" mode which allows batching writes before flushing them to the disk.
  It makes it possible to use Vitastor with desktop SSDs, but it still lowers performance due to
  additional network roundtrips, so use server SSDs with capacitor-based power loss protection
  ("Advanced Power Loss Protection") for best performance.
- PGs are ephemeral. This means that they aren't stored on data disks and only exist in memory
  while OSDs are running.
- The recovery process is per-object (per-block), not per-PG. Also, there are no PGLOGs.
- Monitors don't store data. Cluster configuration and state are stored in etcd as simple
  human-readable JSON structures. Monitors only watch cluster state and handle data movement.
  Thus Vitastor's Monitor isn't a critical component of the system and is more similar to
  Ceph's Manager. Vitastor's Monitor is implemented in node.js.
- PG distribution isn't based on consistent hashes. All PG mappings are stored in etcd.
  Rebalancing PGs between OSDs is done by mathematical optimization: the data distribution problem
  is reduced to a linear programming problem and solved by lp_solve. This allows for almost
  perfect (96-99% uniformity compared to Ceph's 80-90%) data distribution in most cases, the
  ability to map PGs by hand without breaking rebalancing logic, reduced OSD peer-to-peer
  communication (on average, OSDs have fewer peers) and less data movement. It probably also has
  a drawback: this method may fail in very large clusters, but up to several hundred OSDs it's
  perfectly fine. It's also easy to add consistent hashes in the future if something proves their
  necessity.
- There's no separate CRUSH layer. You select the pool redundancy scheme, placement root, failure
  domain and so on directly in the pool configuration.
- Images are global, i.e. you can't create multiple images with the same name in different pools.

## Implementation Principles

- I like architecturally simple solutions. Vitastor is and will always be designed
  exactly like that.
- I also like reinventing the wheel to some extent, like writing my own HTTP client
  for etcd interaction instead of using prebuilt libraries, because in this case
  I'm confident about what my code does and what it doesn't do.
- I don't care about C++ "best practices" like RAII or proper inheritance or usage of
  smart pointers or whatever, and I don't intend to change my mind, so if you're here
  looking for ideal reference C++ code, this probably isn't the right place.
- I like node.js better than any other dynamically-typed language interpreter
  because it's faster than any other interpreter in the world, has neutral C-like
  syntax and a built-in event loop. That's why the Monitor is implemented in node.js.
@@ -1,34 +0,0 @@
---
title: Author and License
weight: 3
---

Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

Join the Vitastor Telegram chat: https://t.me/vitastor

All server-side code (OSD, Monitor and so on) is licensed under the terms of the
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause, which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
with it ("Proxy Programs"). Proxy Programs may be made public not only under
the terms of the same license, but also under the terms of any GPL-compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.

Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.

Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor, or purchasing a commercial license
from the author 😀.

Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
@@ -1,60 +0,0 @@
---
title: Features
weight: 1
---

Vitastor is currently a pre-release and it still lacks some important features.
However, the following is already implemented:

- Basic part: highly-available block storage with symmetric clustering and no SPOF
- Performance ;-D
- Multiple redundancy schemes: replication, XOR n+1, Reed-Solomon erasure codes
  based on the jerasure library with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
- Automatic data distribution over OSDs, with support for:
  - Mathematical optimization for better uniformity and less data movement
  - Multiple pools
  - Placement tree, OSD selection by tags (device classes) and placement root
  - Configurable failure domains
- Recovery of degraded blocks
- Rebalancing (data movement between OSDs)
- Lazy fsync support
- Per-OSD and per-image I/O and space usage statistics in etcd
- Snapshots and copy-on-write image clones
- Write throttling to smooth random write workloads in SSD+HDD configurations
- RDMA/RoCEv2 support via libibverbs

CLI (vitastor-cli):
- Pool listing and space stats (df)
- Image listing, space and I/O stats (ls)
- Image and snapshot creation (create, modify)
- Image removal and snapshot merge (rm, flatten, merge, rm-data)

Plugins and packaging:
- Debian and CentOS packages
- Generic user-space client library
- Native QEMU driver
- Loadable fio engine for benchmarks
- NBD proxy for kernel mounts
- CSI plugin for Kubernetes
- OpenStack support: Cinder driver, Nova and libvirt patches
- Proxmox storage plugin and packages

## Roadmap

The following features are planned for the future:

- Better OSD creation and auto-start tools
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- Simplified NFS proxy
- Multi-threaded client
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using the system page cache (possibly)
@@ -1,93 +0,0 @@
---
title: Example Comparison with Ceph
weight: 4
---

Hardware configuration: 4 nodes, each with:
- 6x SATA SSD Intel D3-S4510 3.84 TB
- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz)
- 384 GB RAM
- 1x 25 GbE network interface (Mellanox ConnectX-4 LX), connected to a Juniper QFX5200 switch

CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD.

All of the results below apply to 4 KB blocks and random access (unless indicated otherwise).

The Ceph T8Q64 tests were conducted over 8 400 GB RBD images from all hosts (every host was running 2 instances of fio),
because Ceph has performance penalties related to running multiple clients over a single RBD image.

cephx_sign_messages was set to false during the tests; RocksDB and Bluestore settings were left at their defaults.

The Vitastor T8Q64 read test was conducted over 1 larger inode (3.2 TB) from all hosts (every host was running 2 instances of fio):
Vitastor has no performance penalties related to running multiple clients over a single inode.
When conducted from one node with all primary OSDs moved to other nodes, the result was slightly lower (689000 iops),
because all operations resulted in network roundtrips between the client and the primary OSD.
When fio was colocated with OSDs (like in the Ceph benchmarks above), 1/4 of the read workload actually
used the loopback network.

Vitastor was configured with: `--disable_data_fsync true --immediate_commit all --flusher_count 8
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096
--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
--journal_size 16777216`.

## Raw drive performance

- T1Q1 write ~27000 iops (~0.037ms latency)
- T1Q1 read ~9800 iops (~0.101ms latency)
- T1Q32 write ~60000 iops
- T1Q32 read ~81700 iops

## 2 replicas

### Ceph 15.2.4 (Bluestore)

- T1Q1 write ~1000 iops (~1ms latency)
- T1Q1 read ~1750 iops (~0.57ms latency)
- T8Q64 write ~100000 iops, total CPU usage by OSDs about 40 virtual cores on each node
- T8Q64 read ~480000 iops, total CPU usage by OSDs about 40 virtual cores on each node

In fact, not that bad for Ceph: these servers are an example of well-balanced Ceph nodes.
However, CPU usage and I/O latency were through the roof, as usual.

### Vitastor 0.4.0 (native)

- T1Q1 write: 7087 iops (0.14ms latency)
- T1Q1 read: 6838 iops (0.145ms latency)
- T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node
- T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node
- Linear write (4M T1Q32): 2800 MB/s
- Linear read (4M T1Q32): 1500 MB/s

### Vitastor 0.4.0 (NBD)

NBD is currently required to mount Vitastor via the kernel, but it imposes additional overhead
due to additional copying between the kernel and userspace. This mostly hurts linear
bandwidth, not iops.

Vitastor with single-threaded NBD on the same hardware:
- T1Q1 write: 6000 iops (0.166ms latency)
- T1Q1 read: 5518 iops (0.18ms latency)
- T1Q128 write: 94400 iops
- T1Q128 read: 103000 iops
- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)

## EC/XOR 2+1

### Ceph 15.2.4

- T1Q1 write: 730 iops (~1.37ms latency)
- T1Q1 read: 1500 iops with cold cache (~0.66ms latency), 2300 iops after 2 minute metadata cache warmup (~0.435ms latency)
- T4Q128 write (4 RBD images): 45300 iops, total CPU usage by OSDs about 30 virtual cores on each node
- T8Q64 read (4 RBD images): 278600 iops, total CPU usage by OSDs about 40 virtual cores on each node
- Linear write (4M T1Q32): 1950 MB/s before preallocation, 2500 MB/s after preallocation
- Linear read (4M T1Q32): 2400 MB/s

### Vitastor 0.4.0

- T1Q1 write: 2808 iops (~0.355ms latency)
- T1Q1 read: 6190 iops (~0.16ms latency)
- T2Q64 write: 85500 iops, total CPU usage by OSDs about 3.4 virtual cores on each node
- T8Q64 read: 812000 iops, total CPU usage by OSDs about 4.7 virtual cores on each node
- Linear write (4M T1Q32): 3200 MB/s
- Linear read (4M T1Q32): 1800 MB/s
@@ -1,46 +0,0 @@
---
title: Vitastor's Theoretical Maximum Performance
weight: 3
---

Replicated setups:
- Single-threaded (T1Q1) read latency: 1 network roundtrip + 1 disk read.
- Single-threaded write+fsync latency:
  - With immediate commit: 2 network roundtrips + 1 disk write.
  - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).

EC/XOR setups:
- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
- Single-threaded write+fsync latency:
  - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
  - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
  - The 0.5 is actually (k-1)/k, which means the additional roundtrip doesn't happen when
    the read sub-operation can be served locally.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
  Strictly speaking, the disk write iops in this formula should be measured under a ~10% read / ~90% write workload.

Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
1. Journal block write
2. Journal data write
3. Metadata block write
4. Another journal block write for EC/XOR setups
5. Data block write

If you manage to get an SSD which handles 512-byte blocks well (Optane?) you may
lower 1, 3 and 4 to 512 bytes (1/8 of the data size) and get WA as low as 2.375.
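
A quick check of that 2.375 figure: writes 2 and 5 stay at the full 4 KB data size, while writes 1, 3 and 4 shrink to 512 bytes each:

```
2 × 4096 B + 3 × 512 B = 9728 B written per 4096 B of data
WA = 9728 / 4096 = 2.375
```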
Lazy fsync also reduces WA for parallel workloads, because journal blocks are only
written when they fill up or when fsync is requested.

## In Practice

In practice, using the tests from [Understanding Performance]({{< ref "performance/understanding" >}})
and good server-grade SSD/NVMe drives, you should aim for:
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case

If your results are lower, that may mean you have bad drives, a bad network or some kind of misconfiguration.
@@ -1,6 +0,0 @@
---
title: Tuning
weight: 2
---

- Disable CPU powersaving (see the commands below)
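
The exact commands for this, matching the Quick Start:

```
cpupower idle-set -D 0
cpupower frequency-set -g performance
```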
@@ -1,52 +0,0 @@
---
title: Understanding Storage Performance
weight: 1
---

The most important thing for fast storage is latency, not parallel iops.

The best possible latency is achieved with one thread and a queue depth of 1, which basically means
"client load as low as possible". In this case IOPS = 1/latency, and this number doesn't
scale with the number of servers, drives, server processes, threads and so on.
Single-threaded IOPS and latency numbers only depend on *how fast a single daemon is*.

Why is this important? Because some applications *can't* use a queue depth greater
than 1: their task simply isn't parallelizable. A notable example is any ACID DBMS,
because all of them write their WALs sequentially with fsync()s.

fsync, by the way, is another important thing often missing in benchmarks. The point is
that drives have cache buffers and don't guarantee that your data is actually persisted
until you call fsync(), which is translated to a FLUSH CACHE command by the OS.

Desktop SSDs are very fast without fsync - NVMes, for example, can process ~80000 write
operations per second with a queue depth of 1 without fsync - but they're really slow with
fsync, because they have to actually write data to the flash chips when you call fsync. The
typical number is around 1000-2000 iops with fsync.

Server SSDs often have supercapacitors that act as a built-in UPS and allow the drive
to flush its DRAM cache to persistent flash storage when a power loss occurs.
This makes them perform equally well with and without fsync. This feature is called
"Advanced Power Loss Protection" by Intel; other vendors either call it similarly
or directly "Full Capacitor-Based Power Loss Protection".

All software-defined storage systems that I currently know of are slow in terms of latency.
Notable examples are Ceph and the internal SDSes used by cloud providers like Amazon, Google,
Yandex and so on. They're all slow and can only reach ~0.3ms read and ~0.6ms 4 KB write latency
with best-in-slot hardware.

And that's in the SSD era, when you can buy an SSD with ~0.04ms latency for $100.

I use the following 6 commands, with small variations, to benchmark any storage:

- Linear write:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
- Linear read:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
- Random write latency (T1Q1, this hurts storages the most):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Random read latency (T1Q1):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
- Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Parallel read iops (use numjobs if a single CPU core is insufficient to saturate the load):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX`
@@ -1,183 +0,0 @@
---
title: Vitastor CLI
weight: 1
---

vitastor-cli is a command-line tool for administrative tasks like image management.

It supports the following commands:

{{< toc >}}

Global options:

```
--etcd_address ADDR  Etcd connection address
--iodepth N          Send N operations in parallel to each OSD when possible (default 32)
--parallel_osds M    Work with M OSDs in parallel when possible (default 4)
--progress 1|0       Report progress (default 1)
--cas 1|0            Use online CAS writes when possible (default auto)
--no-color           Disable colored output
--json               JSON output
```

## status

`vitastor-cli status`

Show cluster status.

Example output:

```
cluster:
  etcd: 1 / 1 up, 1.8 M database size
  mon:  1 up, master stump
  osd:  8 / 12 up

data:
  raw:   498.5 G used, 301.2 G / 799.7 G available, 399.8 G down
  state: 156.6 G clean, 97.6 G misplaced
  pools: 2 / 3 active
  pgs:   30 active
         34 active+has_misplaced
         32 offline

io:
  client:    0 B/s rd, 0 op/s rd, 0 B/s wr, 0 op/s wr
  rebalance: 989.8 M/s, 7.9 K op/s
```

## df

`vitastor-cli df`

Show pool space statistics.

Example output:

```
NAME      SCHEME  PGS  TOTAL    USED    AVAILABLE  USED%   EFFICIENCY
testpool  2/1     32   100 G    34.2 G  60.7 G     39.23%  100%
size1     1/1     32   199.9 G  10 G    121.5 G    39.23%  100%
kaveri    2/1     32   0 B      10 G    0 B        100%    0%
```

In the example above, the "kaveri" pool has zero efficiency because all its OSDs are down.

## ls

`vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]`

List images (only those matching the `<glob>` pattern(s), if passed).

Options:

```
-p|--pool POOL  Filter images by pool ID or name
-l|--long       Also report allocated size and I/O statistics
--del           Also include delete operation statistics
--sort FIELD    Sort by the specified field (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>)
-r|--reverse    Sort in descending order
-n|--count N    Only list the first N items
```

Example output:

```
NAME                 POOL      SIZE  USED    READ   IOPS  QUEUE  LAT   WRITE  IOPS  QUEUE  LAT   FLAGS  PARENT
debian9              testpool  20 G  12.3 G  0 B/s  0     0      0 us  0 B/s  0     0      0 us  RO
pve/vm-100-disk-0    testpool  20 G  0 B     0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      debian9
pve/base-101-disk-0  testpool  20 G  0 B     0 B/s  0     0      0 us  0 B/s  0     0      0 us  RO     debian9
pve/vm-102-disk-0    testpool  32 G  36.4 M  0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      pve/base-101-disk-0
debian9-test         testpool  20 G  36.6 M  0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      debian9
bench                testpool  10 G  10 G    0 B/s  0     0      0 us  0 B/s  0     0      0 us  -
bench-kaveri         kaveri    10 G  10 G    0 B/s  0     0      0 us  0 B/s  0     0      0 us  -
```

## create

`vitastor-cli create -s|--size <size> [-p|--pool <id|name>] [--parent <parent_name>[@<snapshot>]] <name>`

Create an image. You may use K/M/G/T suffixes for `<size>`. If `--parent` is specified,
a copy-on-write image clone is created. The parent must be a snapshot (a read-only image).
The pool must be specified if there is more than one pool.

```
vitastor-cli create --snapshot <snapshot> [-p|--pool <id|name>] <image>
vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
```

Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
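
A couple of concrete invocations following the syntax above (the pool and image names are just examples):

```
vitastor-cli create -s 10G -p testpool testimg
vitastor-cli snap-create -p testpool testimg@snap1
```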
## modify

`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

Rename or resize an image, or change its readonly status. Images with children can't be made read-write.
If the new size is smaller than the old size, extra data will be purged.
You should resize the file system in the image, if present, before shrinking the image.

```
-f|--force  Proceed with shrinking or setting the readwrite flag even if the image has children.
```

## rm

`vitastor-cli rm <from> [<to>] [--writers-stopped]`

Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
rebasing all their children accordingly. `--writers-stopped` allows the merge to be a bit
more efficient in the case of a single "slim" read-write child and a "fat" removed parent:
in that case the child is merged into the parent and the parent is renamed to the child.
In all other cases parent layers are merged into children.

## flatten

`vitastor-cli flatten <layer>`

Flatten a layer, i.e. merge data and detach it from parents.

## rm-data

`vitastor-cli rm-data --pool <pool> --inode <inode> [--wait-list] [--min-offset <offset>]`

Remove inode data without changing metadata.

```
--wait-list   Retrieve full object listings before starting to remove objects.
              Requires more memory, but allows showing correct removal progress.
--min-offset  Purge only data starting with the specified offset.
```

## merge-data

`vitastor-cli merge-data <from> <to> [--target <target>]`

Merge layer data without changing metadata. Merges `<from>`..`<to>` into `<target>`.
`<to>` must be a child of `<from>`, and `<target>` may be one of the layers between
`<from>` and `<to>`, including `<from>` and `<to>` themselves.

## alloc-osd

`vitastor-cli alloc-osd`

Allocate a new OSD number and reserve it by creating an empty `/osd/stats/<n>` key.

## simple-offsets

`vitastor-cli simple-offsets <device>`

Calculate offsets for a simple&stupid (no superblock) OSD deployment.

Options:

```
--object_size 128k       Set blockstore block size
--bitmap_granularity 4k  Set bitmap granularity
--journal_size 16M       Set journal size
--device_block_size 4k   Set device block size
--journal_offset 0       Set journal offset
--device_size 0          Set device size
--format text            Result format: json, options, env, or text
```
@@ -1,20 +0,0 @@
---
title: NBD
weight: 6
---

To create a local block device for a Vitastor image, use NBD. For example:

```
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```

It will output the device name, like `/dev/nbd0`, which you can then format and mount as a normal block device.
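
For example, a minimal sketch (ext4 here is just an illustration, any file system works):

```
mkfs.ext4 /dev/nbd0
mount /dev/nbd0 /mnt
```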
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

To unmap the device, run:

```
vitastor-nbd unmap /dev/nbd0
```
@@ -1,39 +0,0 @@
---
title: QEMU and qemu-img
weight: 2
---

You need a patched QEMU version to use the Vitastor driver.

To start a VM with a Vitastor disk using the plain QEMU command line, use the following commands.

Old syntax (-drive):

```
qemu-system-x86_64 -enable-kvm -m 1024 \
    -drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9',
        format=raw,if=none,id=drive-virtio-disk0,cache=none \
    -device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
        id=virtio-disk0,bootindex=1,write-cache=off' \
    -vnc 0.0.0.0:0
```

New syntax (-blockdev):

```
qemu-system-x86_64 -enable-kvm -m 1024 \
    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
    -device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
        id=virtio-disk0,bootindex=1,write-cache=off' \
    -vnc 0.0.0.0:0
```

For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as the filename. For example:

```
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian10'
```

You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
if you don't want to use inode metadata.
@@ -1,37 +0,0 @@
---
nav_navigation: Навигация
nav_tags: Теги
nav_more: Подробнее
nav_top: К началу

form_placeholder_search: Поиск

error_page_title: Открыта несуществующая страница
error_message_title: Потерялись?
error_message_code: Ошибка 404
error_message_text: >
  Похоже, страница, которую вы открыли, не существует. Попробуйте найти
  нужную информацию с <a class="gdoc-error__link" href="{{ . }}">главной страницы</a>.

button_toggle_dark: Переключить тёмный/светлый/авто режим
button_nav_open: Показать навигацию
button_nav_close: Скрыть навигацию
button_menu_open: Открыть меню
button_menu_close: Закрыть меню
button_homepage: На главную

title_anchor_prefix: "Ссылка на:"

posts_read_more: Читать подробнее
posts_read_time:
  one: "Одна минута на чтение"
  other: "{{ . }} минут(ы) на чтение"
posts_update_prefix: Обновлено

footer_build_with: >
  Сделано на <a href="https://gohugo.io/" class="gdoc-footer__link">Hugo</a> с
  <svg class="icon gdoc_heart"><use xlink:href="#gdoc_heart"></use></svg>
footer_legal_notice: Правовая информация
footer_privacy_policy: Приватность

language_switch_no_tranlation_prefix: "Страница не переведена:"
@@ -1,34 +0,0 @@
<footer class="gdoc-footer">
  <div class="container flex">
    <div class="flex flex-wrap" style="flex: 1">
      <span class="gdoc-footer__item gdoc-footer__item--row">
        © Vitaliy Filippov, 2021+
      </span>
    </div>
    <div class="flex flex-wrap">
      {{ with .Site.Params.GeekdocLegalNotice }}
        <span class="gdoc-footer__item gdoc-footer__item--row">
          <a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_legal_notice" }}</a>
        </span>
      {{ end }}
      {{ with .Site.Params.GeekdocPrivacyPolicy }}
        <span class="gdoc-footer__item gdoc-footer__item--row">
          <a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_privacy_policy" }}</a>
        </span>
      {{ end }}
    </div>
    {{ if (default true .Site.Params.GeekdocBackToTop) }}
      <div class="flex flex-25 justify-end">
        <span class="gdoc-footer__item gdoc-footer__item--row" style="margin-right: 50px">
          {{ i18n "footer_build_with" | safeHTML }}
        </span>
        <span class="gdoc-footer__item">
          <a class="gdoc-footer__link fake-link" href="#" aria-label="{{ i18n "nav_top" }}">
            <svg class="icon gdoc_keyboard_arrow_up"><use xlink:href="#gdoc_keyboard_arrow_up"></use></svg>
            <span class="hidden-mobile">{{ i18n "nav_top" }}</span>
          </a>
        </span>
      </div>
    {{ end }}
  </div>
</footer>
@@ -1,215 +0,0 @@
(deleted file: logo_only2.svg — the Vitastor logo as Inkscape SVG source, 7.4 KiB; full markup omitted)
@@ -1,138 +0,0 @@
/* Global customization */

:root {
  --code-max-height: 60rem;
}

/* Light mode theming */
:root,
:root[color-mode="light"] {
  --header-background: #404050;
  --header-font-color: #ffffff;

  --body-background: #ffffff;
  --body-font-color: #343a40;

  --button-background: #62cb97;
  --button-border-color: #4ec58a;

  --link-color: #c54e8a;
  --link-color-visited: #c54e8a;

  --code-background: #f5f6f8;
  --code-accent-color: #e3e7eb;
  --code-accent-color-lite: #eff1f3;

  --accent-color: #e9ecef;
  --accent-color-lite: #f8f9fa;

  --control-icons: #b2bac1;

  --footer-background: #606070;
  --footer-font-color: #ffffff;
  --footer-link-color: #ffcc5c;
  --footer-link-color-visited: #ffcc5c;
}
@media (prefers-color-scheme: light) {
  :root {
    --header-background: #404050;
    --header-font-color: #ffffff;

    --body-background: #ffffff;
    --body-font-color: #343a40;

    --button-background: #62cb97;
    --button-border-color: #4ec58a;

    --link-color: #c54e8a;
    --link-color-visited: #c54e8a;

    --code-background: #f5f6f8;
    --code-accent-color: #e3e7eb;
    --code-accent-color-lite: #eff1f3;

    --accent-color: #e9ecef;
    --accent-color-lite: #f8f9fa;

    --control-icons: #b2bac1;

    --footer-background: #606070;
    --footer-font-color: #ffffff;
    --footer-link-color: #ffcc5c;
    --footer-link-color-visited: #ffcc5c;
  }
}

/* Dark mode theming */
:root[color-mode="dark"] {
  --header-background: #202830;
  --header-font-color: #ffffff;

  --body-background: #343a44;
  --body-font-color: #ced3d8;

  --button-background: #62cb97;
  --button-border-color: #4ec58a;

  --link-color: #7ac29e;
  --link-color-visited: #7ac29e;

  --code-background: #2f353a;
  --code-accent-color: #262b2f;
  --code-accent-color-lite: #2b3035;

  --accent-color: #2b3035;
  --accent-color-lite: #2f353a;

  --control-icons: #b2bac1;

  --footer-background: #2f333e;
  --footer-font-color: #cccccc;
  --footer-link-color: #7ac29e;
  --footer-link-color-visited: #7ac29e;
}
@media (prefers-color-scheme: dark) {
  :root {
    --header-background: #404070;
    --header-font-color: #ffffff;

    --body-background: #343a40;
    --body-font-color: #ced3d8;

    --button-background: #62cb97;
    --button-border-color: #4ec58a;

    --link-color: #7ac29e;
    --link-color-visited: #7ac29e;

    --code-background: #2f353a;
    --code-accent-color: #262b2f;
    --code-accent-color-lite: #2b3035;

    --accent-color: #2b3035;
    --accent-color-lite: #2f353a;

    --control-icons: #b2bac1;

    --footer-background: #2f333e;
    --footer-font-color: #cccccc;
    --footer-link-color: #7ac29e;
    --footer-link-color-visited: #7ac29e;
  }
}

.gdoc-brand__img {
  width: 48px;
  height: auto;
  margin-top: -4px;
  margin-bottom: -4px;
}

.gdoc-menu-header > span {
  display: flex;
  flex-direction: row-reverse;
}

span.gdoc-language {
  margin-right: 20px;
}
Binary file not shown. (Before: 709 B)
Binary file not shown. (Before: 1.5 KiB)
@@ -1,196 +0,0 @@
(deleted file: favicon.svg — the Vitastor favicon as Inkscape SVG source, 6.2 KiB; full markup omitted)
@@ -1,6 +0,0 @@
---
title: Common Parameters
weight: 1
---

These are the most common parameters which apply to all components of Vitastor.
@@ -1,6 +0,0 @@
---
title: Общие параметры
weight: 1
---

Это наиболее общие параметры, используемые всеми компонентами Vitastor.
@@ -1,7 +0,0 @@
---
title: Cluster-Wide Disk Layout Parameters
weight: 2
---

These parameters apply to clients and OSDs, are fixed at the moment of OSD drive
initialization, and can't be changed afterwards without losing data.
@@ -1,7 +0,0 @@
---
title: Дисковые параметры уровня кластера
weight: 2
---

Данные параметры используются клиентами и OSD, задаются в момент инициализации
диска OSD и не могут быть изменены после этого без потери данных.
@@ -1,7 +0,0 @@
---
title: OSD Disk Layout Parameters
weight: 3
---

These parameters apply to OSDs, are fixed at the moment of OSD drive
initialization, and can't be changed afterwards without losing data.
@@ -1,8 +0,0 @@
---
title: Дисковые параметры OSD
weight: 3
---

Данные параметры используются только OSD и, также как и общекластерные
дисковые параметры, задаются в момент инициализации дисков OSD и не могут быть
изменены после этого без потери данных.
@@ -1,6 +0,0 @@
---
title: Monitor Parameters
weight: 6
---

These parameters only apply to Monitors.
@@ -1,6 +0,0 @@
---
title: Параметры мониторов
weight: 6
---

Данные параметры используются только мониторами Vitastor.
@@ -1,7 +0,0 @@
---
title: Network Protocol Parameters
weight: 4
---

These parameters apply to clients and OSDs and affect the network connection logic
between clients, OSDs and etcd.
@@ -1,7 +0,0 @@
---
title: Параметры сетевого протокола
weight: 4
---

Данные параметры используются клиентами и OSD и влияют на логику сетевого
взаимодействия между клиентами, OSD, а также etcd.
@@ -1,7 +0,0 @@
---
title: Runtime OSD Parameters
weight: 5
---

These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization, and can be changed with an OSD restart.
@@ -1,8 +0,0 @@
---
title: Изменяемые параметры OSD
weight: 5
---

Данные параметры используются только OSD, но, в отличие от дисковых параметров,
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
момент с перезапуском OSD.
@@ -30,6 +30,18 @@
     будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
     это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
     принципе, это может применяться для клиентов со старыми версиями ядра.
+- name: use_zerocopy_send
+  type: bool
+  default: false
+  info: |
+    If true, OSDs and clients will attempt to use TCP zero-copy send
+    (MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
+    and net.core.wmem_max sysctls when using this mode.
+  info_ru: |
+    Если установлено в true, то OSD и клиенты будут стараться использовать
+    TCP-отправку без копирования (MSG_ZEROCOPY) для больших буферов данных.
+    Рекомендуется поднять значения sysctl net.ipv4.tcp_wmem и net.core.wmem_max
+    при использовании этого режима.
 - name: use_rdma
   type: bool
   default: true
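For context: `use_zerocopy_send` builds on the kernel's MSG_ZEROCOPY interface (TCP support since Linux 4.14). Below is a minimal, self-contained sketch of that flow, independent of Vitastor's messenger code — socket setup, the send itself, and the completion read from the error queue; the function name is illustrative.

```
// Minimal sketch of the kernel MSG_ZEROCOPY flow (assumes Linux >= 4.14
// and a connected TCP socket; illustration only, not Vitastor code).
#include <sys/socket.h>
#include <netinet/in.h>       // SOL_IP, IP_RECVERR
#include <linux/errqueue.h>   // sock_extended_err, SO_EE_ORIGIN_ZEROCOPY
#include <poll.h>
#include <errno.h>
#include <stdio.h>

static int send_one_zerocopy(int fd, const void *buf, size_t len)
{
    int one = 1;
    // Opt in once per socket; fails on old kernels or unsupported sockets.
    if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) != 0)
        return -errno;
    if (send(fd, buf, len, MSG_ZEROCOPY) < 0)
        return -errno;
    // The kernel pins 'buf' instead of copying it, so it must stay alive
    // until a completion arrives on the error queue (signalled as POLLERR).
    struct pollfd pfd = { .fd = fd, .events = 0, .revents = 0 };
    if (poll(&pfd, 1, -1) < 0)
        return -errno;
    char control[128];
    struct msghdr msg = {};
    msg.msg_control = control;
    msg.msg_controllen = sizeof(control);
    if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
        return -errno;
    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
    {
        if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
        {
            struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
            if (serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
                printf("zero-copy sends %u..%u completed\n", serr->ee_info, serr->ee_data);
        }
    }
    return 0;
}
```

The messenger changes below avoid this blocking pattern: they batch the notification reads instead of waiting after every send.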
@@ -248,8 +248,6 @@
     row and slow down significantly (from 25000+ iops to ~3000 iops). When
     this option is set, Vitastor will always move to the next sector of the
     journal after writing it instead of possibly overwriting it the second time.
-
-    Most (99%) other SSDs don't need this option.
   info_ru: |
     Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
     ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
@@ -258,8 +256,6 @@
     данная опция установлена, Vitastor всегда переходит к следующему сектору
     журнала после записи вместо потенциально повторной перезаписи того же
     самого сектора.
-
-    Почти все другие SSD (99% моделей) не требуют данной опции.
 - name: throttle_small_writes
   type: bool
   default: false
@@ -64,6 +64,7 @@ const etcd_tree = {
         // client and osd
         tcp_header_buffer_size: 65536,
         use_sync_send_recv: false,
+        use_zerocopy_send: false,
         use_rdma: true,
         rdma_device: null, // for example, "rocep5s0f0"
         rdma_port_num: 1,
@@ -146,7 +146,7 @@ resume_2:
     else
     {
         down_raw += kv.value["size"].uint64_value();
-        free_down_raw += kv.value["size"].uint64_value();
+        free_down_raw += kv.value["free"].uint64_value();
     }
 }
 int pool_count = 0, pools_active = 0;
@@ -39,6 +39,12 @@ void osd_messenger_t::init()
         handle_rdma_events();
     }
 }
+#endif
+#ifndef SO_ZEROCOPY
+    if (log_level > 0)
+    {
+        fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
+    }
 #endif
     keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
     {
@@ -162,6 +168,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
         this->receive_buffer_size = 65536;
     this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
         config["use_sync_send_recv"].uint64_value();
+    this->use_zerocopy_send = config["use_zerocopy_send"].bool_value() ||
+        config["use_zerocopy_send"].uint64_value();
     this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
     if (!this->peer_connect_interval)
         this->peer_connect_interval = 5;
@@ -288,8 +296,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
         on_connect_peer(peer_osd, -result);
         return;
     }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+    set_socket_options(cl);
     cl->peer_state = PEER_CONNECTED;
     tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
     {
@@ -299,6 +306,23 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
         check_peer_config(cl);
     }
 }
+
+void osd_messenger_t::set_socket_options(osd_client_t *cl)
+{
+    int one = 1;
+    setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+#ifdef SO_ZEROCOPY
+    if (!use_zerocopy_send)
+        cl->zerocopy_send = false;
+    else if (setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) != 0)
+    {
+        if (log_level > 0)
+            fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
+    }
+    else
+        cl->zerocopy_send = true;
+#endif
+}

 void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
 {
     // Mark client as ready (i.e. some data is available)
@@ -493,14 +517,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
         fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
             addr_to_string(addr).c_str());
         fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-        int one = 1;
-        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = new osd_client_t();
-        clients[peer_fd]->peer_addr = addr;
-        clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
-        clients[peer_fd]->peer_fd = peer_fd;
-        clients[peer_fd]->peer_state = PEER_CONNECTED;
-        clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
+        auto cl = clients[peer_fd] = new osd_client_t();
+        cl->peer_addr = addr;
+        cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
+        cl->peer_fd = peer_fd;
+        cl->peer_state = PEER_CONNECTED;
+        cl->in_buf = malloc_or_die(receive_buffer_size);
+        set_socket_options(cl);
         // Add FD to epoll
         tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
         {
@@ -45,6 +45,12 @@ struct msgr_sendp_t
     int flags;
 };

+struct msgr_zc_not_t
+{
+    osd_op_t *op;
+    uint32_t nsend;
+};
+
 struct osd_client_t
 {
     int refs = 0;
@@ -57,6 +63,7 @@ struct osd_client_t
     int ping_time_remaining = 0;
     int idle_time_remaining = 0;
     osd_num_t osd_num = 0;
+    bool zerocopy_send = false;

     void *in_buf = NULL;

@@ -87,6 +94,12 @@ struct osd_client_t
     int write_state = 0;
     std::vector<iovec> send_list, next_send_list;
     std::vector<msgr_sendp_t> outbox, next_outbox;
+    std::vector<msgr_zc_not_t> zerocopy_sent;
+    uint64_t outbox_size = 0, next_outbox_size = 0;
+    uint32_t zerocopy_notification_idx = 0;
+    uint32_t zerocopy_notification_prev = 0;
+    uint8_t zerocopy_notification_buf[256];
+    struct msghdr zerocopy_notification_msg;

     ~osd_client_t()
     {
@@ -127,6 +140,7 @@ protected:
     int osd_ping_timeout = 0;
     int log_level = 0;
     bool use_sync_send_recv = false;
+    bool use_zerocopy_send = false;

 #ifdef WITH_RDMA
     bool use_rdma = true;
@@ -181,10 +195,12 @@ protected:
     void check_peer_config(osd_client_t *cl);
     void cancel_osd_ops(osd_client_t *cl);
     void cancel_op(osd_op_t *op);
+    void set_socket_options(osd_client_t *cl);

     bool try_send(osd_client_t *cl);
     void measure_exec(osd_op_t *cur_op);
     void handle_send(int result, osd_client_t *cl);
+    void handle_zerocopy_notification(osd_client_t *cl, int res);

     bool handle_read(int result, osd_client_t *cl);
     bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
@@ -6,6 +6,12 @@

 #include "messenger.h"

+#include <linux/errqueue.h>
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0
+#endif
+
 void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 {
     assert(cur_op->peer_fd);
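Defining MSG_ZEROCOPY to 0 when the headers lack it is a compile-time fallback: OR-ing a zero flag into the sendmsg() flags changes nothing, so the same call sites silently degrade to ordinary copying sends on old systems. A minimal illustration of the idea (the helper name is ours, not the patch's):

```
#include <sys/socket.h>

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif

// With MSG_ZEROCOPY defined as 0, this always evaluates to MSG_NOSIGNAL,
// i.e. the zero-copy request quietly becomes a normal send.
static int effective_send_flags(bool zerocopy)
{
    return MSG_NOSIGNAL | (zerocopy ? MSG_ZEROCOPY : 0);
}
```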
@@ -36,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
     }
     auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
     auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
+    auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
     if (cur_op->op_type == OSD_OP_IN)
     {
         measure_exec(cur_op);
@@ -46,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
         to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
         cl->sent_ops[cur_op->req.hdr.id] = cur_op;
     }
+    to_size += OSD_PACKET_SIZE;
     to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
     // Bitmap
     if (cur_op->op_type == OSD_OP_IN &&
@@ -57,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             .iov_len = cur_op->reply.sec_rw.attr_len,
         });
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        to_size += cur_op->reply.sec_rw.attr_len;
     }
     else if (cur_op->op_type == OSD_OP_OUT &&
         (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
@@ -67,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             .iov_len = cur_op->req.sec_rw.attr_len,
         });
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        to_size += cur_op->req.sec_rw.attr_len;
     }
     // Operation data
     if ((cur_op->op_type == OSD_OP_IN
@@ -86,14 +96,21 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             assert(cur_op->iov.buf[i].iov_base);
             to_send_list.push_back(cur_op->iov.buf[i]);
             to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+            to_size += cur_op->iov.buf[i].iov_len;
         }
     }
     if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
     {
         if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
+        {
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
+            to_size += cur_op->reply.hdr.retval;
+        }
         else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
+        {
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
+            to_size += cur_op->req.sec_read_bmp.len;
+        }
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
     }
     if (cur_op->op_type == OSD_OP_IN)
@@ -177,17 +194,19 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
         }
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
         ring_data_t* data = ((ring_data_t*)sqe->user_data);
         data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
-        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
+        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
     }
     else
     {
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
-        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
+        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
         if (result < 0)
         {
             result = -errno;
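The repeated `(cl->outbox_size/cl->send_list.size()) >= 4096` expression gates MSG_ZEROCOPY on the average queued buffer size: pinning user pages has a fixed per-call cost that only pays off for sends of roughly a page or more. The same rule as a standalone sketch (illustrative helper, not part of the patch):

```
#include <stddef.h>
#include <stdint.h>
#include <sys/socket.h>

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif

// Request zero-copy only when it's enabled for this client socket and the
// average buffer in the outbox is at least 4096 bytes.
static int pick_zc_flags(bool zerocopy_send, uint64_t outbox_size, size_t send_list_size)
{
    if (!zerocopy_send || send_list_size == 0)
        return 0;
    return (outbox_size / send_list_size) >= 4096 ? MSG_ZEROCOPY : 0;
}
```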
@@ -197,6 +216,62 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
     return true;
 }

+void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
+{
+    cl->refs--;
+    if (cl->peer_state == PEER_STOPPED)
+    {
+        if (cl->refs <= 0)
+        {
+            delete cl;
+        }
+        return;
+    }
+    if (res != 0)
+    {
+        return;
+    }
+    if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
+    {
+        fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
+        return;
+    }
+    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
+    {
+        if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
+        {
+            struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
+            if (serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
+            {
+                // completed sends numbered serr->ee_info .. serr->ee_data
+                int start = 0;
+                while (start < cl->zerocopy_sent.size() && cl->zerocopy_sent[start].nsend < serr->ee_info)
+                    start++;
+                int end = start;
+                if (serr->ee_data < serr->ee_info)
+                {
+                    // counter has wrapped around
+                    while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend >= cl->zerocopy_sent[start].nsend)
+                        end++;
+                }
+                while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend <= serr->ee_data)
+                    end++;
+                if (end > start)
+                {
+                    for (int i = start; i < end; i++)
+                    {
+                        delete cl->zerocopy_sent[i].op;
+                    }
+                    cl->zerocopy_sent.erase(
+                        cl->zerocopy_sent.begin() + start,
+                        cl->zerocopy_sent.begin() + end
+                    );
+                }
+            }
+        }
+    }
+}
+
 void osd_messenger_t::send_replies()
 {
     for (int i = 0; i < write_ready_clients.size(); i++)
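Each notification describes an inclusive range `ee_info..ee_data` of 32-bit send counters, and the counter may wrap around, which is why the scan above needs the extra wraparound loop. For reference, the same membership test can be written as a single unsigned comparison (an equivalent formulation, not the patch's code):

```
#include <stdint.h>

// True if 'nsend' lies in the inclusive range [ee_info, ee_data], even
// when the 32-bit counter has wrapped: unsigned subtraction rebases the
// range to start at zero.
static bool nsend_completed(uint32_t nsend, uint32_t ee_info, uint32_t ee_data)
{
    return (uint32_t)(nsend - ee_info) <= (uint32_t)(ee_data - ee_info);
}
```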
@@ -224,16 +299,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
         }
         return;
     }
-    if (result < 0 && result != -EAGAIN && result != -EINTR)
+    if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
     {
         // this is a client socket, so don't panic. just disconnect it
         fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
         stop_client(cl->peer_fd);
         return;
     }
+    bool used_zerocopy = false;
     if (result >= 0)
     {
+        used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
         int done = 0;
+        int bytes_written = result;
         while (result > 0 && done < cl->send_list.size())
         {
             iovec & iov = cl->send_list[done];
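`-ENOBUFS` is now tolerated because a zero-copy send can fail transiently when the kernel cannot pin more pages (for example, when the socket's optmem budget is exhausted). The patch simply avoids disconnecting the client in that case; another common pattern, shown purely as a hypothetical sketch rather than this patch's behaviour, is to retry the same message as a plain copying send:

```
#include <sys/socket.h>
#include <errno.h>

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif

// Hypothetical fallback, not what this patch does: on ENOBUFS, retry
// without MSG_ZEROCOPY so the kernel copies the data as usual.
static ssize_t sendmsg_zc_or_copy(int fd, struct msghdr *msg)
{
    ssize_t res = sendmsg(fd, msg, MSG_NOSIGNAL | MSG_ZEROCOPY);
    if (res < 0 && errno == ENOBUFS)
        res = sendmsg(fd, msg, MSG_NOSIGNAL);
    return res;
}
```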
@@ -242,7 +320,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
             if (cl->outbox[done].flags & MSGR_SENDP_FREE)
             {
                 // Reply fully sent
-                delete cl->outbox[done].op;
+                if (!used_zerocopy)
+                {
+                    delete cl->outbox[done].op;
+                }
+                else
+                {
+                    // With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
+                    // allocated until we get send notification from MSG_ERRQUEUE
+                    cl->zerocopy_sent.push_back((msgr_zc_not_t){
+                        .op = cl->outbox[done].op,
+                        .nsend = cl->zerocopy_notification_idx,
+                    });
+                }
             }
             result -= iov.iov_len;
             done++;
@@ -254,6 +344,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
                 break;
             }
         }
+        if (used_zerocopy)
+        {
+            cl->zerocopy_notification_idx++;
+        }
+        cl->outbox_size -= bytes_written;
         if (done > 0)
         {
             cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
@@ -263,8 +358,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
         {
             cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
             cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
+            cl->outbox_size += cl->next_outbox_size;
             cl->next_send_list.clear();
             cl->next_outbox.clear();
+            cl->next_outbox_size = 0;
         }
         cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
 #ifdef WITH_RDMA
@@ -287,4 +384,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
     {
         write_ready_clients.push_back(cl->peer_fd);
     }
+    if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
+        cl->zerocopy_sent.size() > 0)
+    {
+        cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
+        cl->zerocopy_notification_msg = {
+            .msg_control = cl->zerocopy_notification_buf,
+            .msg_controllen = sizeof(cl->zerocopy_notification_buf),
+        };
+        cl->refs++;
+        io_uring_sqe* sqe = NULL;
+        if (ringloop && !use_sync_send_recv)
+        {
+            sqe = ringloop->get_sqe();
+        }
+        if (!sqe)
+        {
+            int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
+            if (res < 0)
+            {
+                res = -errno;
+            }
+            handle_zerocopy_notification(cl, res);
+        }
+        else
+        {
+            ring_data_t* data = ((ring_data_t*)sqe->user_data);
+            data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
+            my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
+        }
+    }
 }
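Note the batching here: the error queue is read only once at least 16 zero-copy sends have accumulated, which amortizes the extra recvmsg() syscall. One caveat worth keeping in mind (an observation, not something this patch does): a single recvmsg(MSG_ERRQUEUE) dequeues only one notification, so a reader that wants to catch up completely has to loop until EAGAIN, roughly as follows:

```
#include <sys/socket.h>
#include <errno.h>
#include <string.h>

// Illustrative helper: drain all pending notifications from a socket's
// error queue. 'handle_one' would parse the SO_EE_ORIGIN_ZEROCOPY cmsgs
// the same way handle_zerocopy_notification() does above.
static int drain_errqueue(int fd, char *control, size_t control_len,
    void (*handle_one)(struct msghdr *msg))
{
    for (;;)
    {
        struct msghdr msg;
        memset(&msg, 0, sizeof(msg));
        msg.msg_control = control;
        msg.msg_controllen = control_len;
        if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
            return (errno == EAGAIN || errno == EWOULDBLOCK) ? 0 : -errno;
        handle_one(&msg);
    }
}
```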