Compare commits
41 Commits
Author | SHA1 | Date | |
---|---|---|---|
85298ddae2 | |||
e23296a327 | |||
839ec9e6e0 | |||
7cbfdff41a | |||
951272f27f | |||
a3fb1d4c98 | |||
88402e6eb6 | |||
390239c51b | |||
b7b2adfa32 | |||
36c276358b | |||
117d6f0612 | |||
7d79c58095 | |||
46d2bc100f | |||
732e2804e9 | |||
abaec2008c | |||
8129d238a4 | |||
61ebed144a | |||
9d3ba113aa | |||
9788045dc9 | |||
d6b0d29af6 | |||
36f352f06f | |||
318cc463c2 | |||
145e5cfb86 | |||
73ae578981 | |||
20ee4ed758 | |||
63de79d1b2 | |||
f712967079 | |||
df0cd85352 | |||
ebaf4d7a72 | |||
d4bc10542c | |||
140309620a | |||
0a610ee943 | |||
f3ce166064 | |||
717d303370 | |||
d9857a5340 | |||
eb5d9153e8 | |||
ae6d1ed1d5 | |||
d123e58ea3 | |||
d9869d8116 | |||
4047ca606f | |||
218e294e9c |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.6.12")
|
||||
set(VERSION "0.6.15")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -55,11 +55,11 @@ Vitastor на данный момент находится в статусе п
|
||||
|
||||
## Планы развития
|
||||
|
||||
- Поддержка удаления снапшотов (слияния слоёв)
|
||||
- Более корректные скрипты разметки дисков и автоматического запуска OSD
|
||||
- Другие инструменты администрирования
|
||||
- Плагины для OpenNebula и других облачных систем
|
||||
- iSCSI-прокси
|
||||
- Упрощённый NFS прокси
|
||||
- Более быстрое переключение при отказах
|
||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||
- Контрольные суммы
|
||||
|
@@ -49,11 +49,11 @@ breaking changes in the future. However, the following is implemented:
|
||||
|
||||
## Roadmap
|
||||
|
||||
- Snapshot deletion (layer merge) support
|
||||
- Better OSD creation and auto-start tools
|
||||
- Other administrative tools
|
||||
- Plugins for OpenNebula, Proxmox and other cloud systems
|
||||
- Plugins for OpenNebula and other cloud systems
|
||||
- iSCSI proxy
|
||||
- Simplified NFS proxy
|
||||
- Faster failover
|
||||
- Scrubbing without checksums (verification of replicas)
|
||||
- Checksums
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.6.12
|
||||
VERSION ?= v0.6.15
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.6.12
|
||||
image: vitalif/vitastor-csi:v0.6.15
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.6.12
|
||||
image: vitalif/vitastor-csi:v0.6.15
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.6.12"
|
||||
vitastorCSIDriverVersion = "0.6.15"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (0.6.12-1) unstable; urgency=medium
|
||||
vitastor (0.6.15-1) unstable; urgency=medium
|
||||
|
||||
* RDMA support
|
||||
* Bugfixes
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -33,8 +33,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.6.12; \
|
||||
cd vitastor-0.6.12; \
|
||||
cp -r /root/vitastor vitastor-0.6.15; \
|
||||
cd vitastor-0.6.15; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -47,8 +47,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.12.orig.tar.xz vitastor-0.6.12; \
|
||||
cd vitastor-0.6.12; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.15.orig.tar.xz vitastor-0.6.15; \
|
||||
cd vitastor-0.6.15; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
35
docs/params/common.yml
Normal file
35
docs/params/common.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
- name: config_path
|
||||
type: string
|
||||
default: "/etc/vitastor/vitastor.conf"
|
||||
info: |
|
||||
Path to the JSON configuration file. Configuration file is optional,
|
||||
a non-existing configuration file does not prevent Vitastor from
|
||||
running if required parameters are specified.
|
||||
info_ru: |
|
||||
Путь к файлу конфигурации в формате JSON. Файл конфигурации необязателен,
|
||||
без него Vitastor тоже будет работать, если переданы необходимые параметры.
|
||||
- name: etcd_address
|
||||
type: string or array of strings
|
||||
type_ru: строка или массив строк
|
||||
info: |
|
||||
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
|
||||
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
Note that https is not supported for etcd connections yet.
|
||||
info_ru: |
|
||||
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
|
||||
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
- name: etcd_prefix
|
||||
type: string
|
||||
default: "/vitastor"
|
||||
info: |
|
||||
Prefix for all keys in etcd used by Vitastor. You can change prefix and, for
|
||||
example, use a single etcd cluster for multiple Vitastor clusters.
|
||||
info_ru: |
|
||||
Префикс для ключей etcd, которые использует Vitastor. Вы можете задать другой
|
||||
префикс, например, чтобы запустить несколько кластеров Vitastor с одним
|
||||
кластером etcd.
|
||||
- name: log_level
|
||||
type: int
|
||||
default: 0
|
||||
info: Log level. Raise if you want more verbose output.
|
||||
info_ru: Уровень логирования. Повысьте, если хотите более подробный вывод.
|
200
docs/params/layout-cluster.yml
Normal file
200
docs/params/layout-cluster.yml
Normal file
@@ -0,0 +1,200 @@
|
||||
- name: block_size
|
||||
type: int
|
||||
default: 131072
|
||||
info: |
|
||||
Size of objects (data blocks) into which all physical and virtual drives are
|
||||
subdivided in Vitastor. It is one of the main settings in Vitastor and affects
|
||||
memory usage, write amplification and I/O load distribution effectiveness.
|
||||
|
||||
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
|
||||
it's possible to use 4 MB for SSD too - it will lower memory usage, but
|
||||
may increase average WA and reduce linear performance.
|
||||
|
||||
OSDs with different block sizes (for example, SSD and SSD+HDD OSDs) can
|
||||
currently coexist in one etcd instance only within separate Vitastor
|
||||
clusters with different etcd_prefix'es.
|
||||
|
||||
Also block size can't be changed after OSD initialization without losing
|
||||
data.
|
||||
|
||||
You must always specify block_size in etcd in /vitastor/config/global if
|
||||
you change it so all clients can know about it.
|
||||
|
||||
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
|
||||
544 MB per 1 TB of used disk space with the default 128 KB block size.
|
||||
info_ru: |
|
||||
Размер объектов (блоков данных), на которые делятся физические и виртуальные
|
||||
диски в Vitastor. Одна из ключевых на данный момент настроек, влияет на
|
||||
потребление памяти, объём избыточной записи (write amplification) и
|
||||
эффективность распределения нагрузки по OSD.
|
||||
|
||||
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
|
||||
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
|
||||
это понизит использование памяти, но ухудшит распределение нагрузки и в
|
||||
среднем увеличит WA.
|
||||
|
||||
OSD с разными размерами блока (например, SSD и SSD+HDD OSD) на данный
|
||||
момент могут сосуществовать в рамках одного etcd только в виде двух независимых
|
||||
кластеров Vitastor с разными etcd_prefix.
|
||||
|
||||
Также размер блока нельзя менять после инициализации OSD без потери данных.
|
||||
|
||||
Если вы меняете размер блока, обязательно прописывайте его в etcd в
|
||||
/vitastor/config/global, дабы все клиенты его знали.
|
||||
|
||||
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
|
||||
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
|
||||
стандартном 128 КБ блоке.
|
||||
- name: bitmap_granularity
|
||||
type: int
|
||||
default: 4096
|
||||
info: |
|
||||
Required virtual disk write alignment ("sector size"). Must be a multiple
|
||||
of disk_alignment. It's called bitmap granularity because Vitastor tracks
|
||||
an allocation bitmap for each object containing 2 bits per each
|
||||
(bitmap_granularity) bytes.
|
||||
|
||||
This parameter can't be changed after OSD initialization without losing
|
||||
data. Also it's fixed for the whole Vitastor cluster i.e. two different
|
||||
values can't be used in a single Vitastor cluster.
|
||||
|
||||
Clients MUST be aware of this parameter value, so put it into etcd key
|
||||
/vitastor/config/global if you change it for any reason.
|
||||
info_ru: |
|
||||
Требуемое выравнивание записи на виртуальные диски (размер их "сектора").
|
||||
Должен быть кратен disk_alignment. Называется гранулярностью битовой карты
|
||||
потому, что Vitastor хранит битовую карту для каждого объекта, содержащую
|
||||
по 2 бита на каждые (bitmap_granularity) байт.
|
||||
|
||||
Данный параметр нельзя менять после инициализации OSD без потери данных.
|
||||
Также он фиксирован для всего кластера Vitastor, т.е. разные значения
|
||||
не могут сосуществовать в одном кластере.
|
||||
|
||||
Клиенты ДОЛЖНЫ знать правильное значение этого параметра, так что если вы
|
||||
его меняете, обязательно прописывайте изменённое значение в etcd в ключ
|
||||
/vitastor/config/global.
|
||||
- name: immediate_commit
|
||||
type: string
|
||||
default: false
|
||||
info: |
|
||||
Another parameter which is really important for performance.
|
||||
|
||||
Desktop SSDs are very fast (100000+ iops) for simple random writes
|
||||
without cache flush. However, they are really slow (only around 1000 iops)
|
||||
if you try to fsync() each write, that is, when you want to guarantee that
|
||||
each change gets immediately persisted to the physical media.
|
||||
|
||||
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
|
||||
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
|
||||
fast with and without fsync because their cache is protected from sudden
|
||||
power loss by a built-in supercapacitor-based "UPS".
|
||||
|
||||
Some software-defined storage systems always fsync each write and thus are
|
||||
really slow when used with desktop SSDs. Vitastor, however, can also
|
||||
efficiently utilize desktop SSDs by postponing fsync until the client calls
|
||||
it explicitly.
|
||||
|
||||
This is what this parameter regulates. When it's set to "all" the whole
|
||||
Vitastor cluster commits each change to disks immediately and clients just
|
||||
ignore fsyncs because they know for sure that they're unneeded. This reduces
|
||||
the amount of network roundtrips performed by clients and improves
|
||||
performance. So it's always better to use server grade SSDs with
|
||||
supercapacitors even with Vitastor, especially given that they cost only
|
||||
a bit more than desktop models.
|
||||
|
||||
There is also a common SATA SSD (and HDD too!) firmware bug (or feature)
|
||||
that makes server SSDs which have supercapacitors slow with fsync. To check
|
||||
if your SSDs are affected, compare benchmark results from `fio -name=test
|
||||
-ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1` with and without
|
||||
`-fsync=1`. Results should be the same. If fsync=1 result is worse you can
|
||||
try to work around this bug by "disabling" drive write-back cache by running
|
||||
`hdparm -W 0 /dev/sdXX` or `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
|
||||
(IMPORTANT: don't confuse it with `/sys/block/sdXX/queue/write_cache` - it's
|
||||
unsafe to change by hand). The same may apply to newer HDDs with internal
|
||||
SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
|
||||
it (they have internal SSD cache even though it's not stated in datasheets).
|
||||
|
||||
This parameter must be set both in etcd in /vitastor/config/global and in
|
||||
OSD command line or configuration. Setting it to "all" or "small" requires
|
||||
enabling disable_journal_fsync and disable_meta_fsync, setting it to "all"
|
||||
also requires enabling disable_data_fsync.
|
||||
|
||||
TLDR: For optimal performance, set immediate_commit to "all" if you only use
|
||||
SSDs with supercapacitor-based power loss protection (nonvolatile
|
||||
write-through cache) for both data and journals in the whole Vitastor
|
||||
cluster. Set it to "small" if you only use such SSDs for journals. Leave
|
||||
empty if your drives have write-back cache.
|
||||
info_ru: |
|
||||
Ещё один важный для производительности параметр.
|
||||
|
||||
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
|
||||
секунду) при простой случайной записи без сбросов кэша. Однако они очень
|
||||
медленные (всего порядка 1000 iops), если вы пытаетесь сбрасывать кэш после
|
||||
каждой записи, то есть, если вы пытаетесь гарантировать, что каждое
|
||||
изменение физически записывается в энергонезависимую память.
|
||||
|
||||
С другой стороны, серверные SSD с конденсаторами - функцией, называемой
|
||||
"Advanced/Enhanced Power Loss Protection" или просто "Supercapacitor-based
|
||||
Power Loss Protection" - одинаково быстрые и со сбросом кэша, и без
|
||||
него, потому что их кэш защищён от потери питания встроенным "источником
|
||||
бесперебойного питания" на основе суперконденсаторов и на самом деле они
|
||||
его никогда не сбрасывают.
|
||||
|
||||
Некоторые программные СХД всегда сбрасывают кэши дисков при каждой записи
|
||||
и поэтому работают очень медленно с настольными SSD. Vitastor, однако, может
|
||||
откладывать fsync до явного его вызова со стороны клиента и таким образом
|
||||
эффективно утилизировать настольные SSD.
|
||||
|
||||
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
|
||||
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
|
||||
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
|
||||
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
|
||||
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
|
||||
использовать только серверные модели SSD с суперконденсаторами, особенно
|
||||
учитывая то, что стоят они ненамного дороже настольных.
|
||||
|
||||
Также в прошивках SATA SSD (и даже HDD!) очень часто встречается либо баг,
|
||||
либо просто особенность логики, из-за которой серверные SSD, имеющие
|
||||
конденсаторы и защиту от потери питания, всё равно медленно работают с
|
||||
fsync. Чтобы понять, подвержены ли этой проблеме ваши SSD, сравните
|
||||
результаты тестов `fio -name=test -ioengine=libaio -direct=1 -bs=4k
|
||||
-rw=randwrite -iodepth=1` без и с опцией `-fsync=1`. Результаты должны
|
||||
быть одинаковые. Если результат с `fsync=1` хуже, вы можете попробовать
|
||||
обойти проблему, "отключив" кэш записи диска командой `hdparm -W 0 /dev/sdXX`
|
||||
либо `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
|
||||
(ВАЖНО: не перепутайте с `/sys/block/sdXX/queue/write_cache` - этот параметр
|
||||
менять руками небезопасно). Такая же проблема может встречаться и в новых
|
||||
HDD-дисках с внутренним SSD или "медиа" кэшем - например, она встречается во
|
||||
многих дисках Seagate EXOS (у них есть внутренний SSD-кэш, хотя это и не
|
||||
указано в спецификациях).
|
||||
|
||||
Данный параметр нужно указывать и в etcd в /vitastor/config/global, и в
|
||||
командной строке или конфигурации OSD. Значения "all" и "small" требуют
|
||||
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
|
||||
требует включения disable_data_fsync.
|
||||
|
||||
Итого, вкратце: для оптимальной производительности установите
|
||||
immediate_commit в значение "all", если вы используете в кластере только SSD
|
||||
с суперконденсаторами и для данных, и для журналов. Если вы используете
|
||||
такие SSD для всех журналов, но не для данных - можете установить параметр
|
||||
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
|
||||
оставьте параметр пустым.
|
||||
- name: client_dirty_limit
|
||||
type: int
|
||||
default: 33554432
|
||||
info: |
|
||||
Without immediate_commit=all this parameter sets the limit of "dirty"
|
||||
(not committed by fsync) data allowed by the client before forcing an
|
||||
additional fsync and committing the data. Also note that the client always
|
||||
holds a copy of uncommitted data in memory so this setting also affects
|
||||
RAM usage of clients.
|
||||
|
||||
This parameter doesn't affect OSDs themselves.
|
||||
info_ru: |
|
||||
При работе без immediate_commit=all - это лимит объёма "грязных" (не
|
||||
зафиксированных fsync-ом) данных, при достижении которого клиент будет
|
||||
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
|
||||
что в этом случае до момента fsync клиент хранит копию незафиксированных
|
||||
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
|
||||
|
||||
Параметр не влияет на сами OSD.
|
205
docs/params/layout-osd.yml
Normal file
205
docs/params/layout-osd.yml
Normal file
@@ -0,0 +1,205 @@
|
||||
- name: data_device
|
||||
type: string
|
||||
info: |
|
||||
Path to the block device to use for data. It's highly recommended to use
|
||||
stable paths for all device names: `/dev/disk/by-partuuid/xxx...` instead
|
||||
of just `/dev/sda` or `/dev/nvme0n1` to not mess up after server restart.
|
||||
Files can also be used instead of block devices, but this is implemented
|
||||
only for testing purposes and not for production.
|
||||
info_ru: |
|
||||
Путь к диску (блочному устройству) для хранения данных. Крайне рекомендуется
|
||||
использовать стабильные пути: `/dev/disk/by-partuuid/xxx...` вместо простых
|
||||
`/dev/sda` или `/dev/nvme0n1`, чтобы пути не могли спутаться после
|
||||
перезагрузки сервера. Также вместо блочных устройств можно указывать файлы,
|
||||
но это реализовано только для тестирования, а не для боевой среды.
|
||||
- name: meta_device
|
||||
type: string
|
||||
info: |
|
||||
Path to the block device to use for the metadata. Metadata must be on a fast
|
||||
SSD or performance will suffer. If this option is skipped, `data_device` is
|
||||
used for the metadata.
|
||||
info_ru: |
|
||||
Путь к диску метаданных. Метаданные должны располагаться на быстром
|
||||
SSD-диске, иначе производительность пострадает. Если эта опция не указана,
|
||||
для метаданных используется `data_device`.
|
||||
- name: journal_device
|
||||
type: string
|
||||
info: |
|
||||
Path to the block device to use for the journal. Journal must be on a fast
|
||||
SSD or performance will suffer. If this option is skipped, `meta_device` is
|
||||
used for the journal, and if it's also empty, journal is put on
|
||||
`data_device`. It's almost always fine to put metadata and journal on the
|
||||
same device, in this case you only need to set `meta_device`.
|
||||
info_ru: |
|
||||
Путь к диску журнала. Журнал должен располагаться на быстром SSD-диске,
|
||||
иначе производительность пострадает. Если эта опция не указана,
|
||||
для журнала используется `meta_device`, если же пуста и она, журнал
|
||||
располагается на `data_device`. Нормально располагать журнал и метаданные
|
||||
на одном устройстве, в этом случае достаточно указать только `meta_device`.
|
||||
- name: journal_offset
|
||||
type: int
|
||||
default: 0
|
||||
info: Offset on the device in bytes where the journal is stored.
|
||||
info_ru: Смещение на устройстве в байтах, по которому располагается журнал.
|
||||
- name: journal_size
|
||||
type: int
|
||||
info: |
|
||||
Journal size in bytes. Doesn't have to be large, 16-32 MB is usually fine.
|
||||
By default, the whole journal device will be used for the journal. You must
|
||||
set it to some value manually (or use make-osd.sh) if you colocate the
|
||||
journal with data or metadata.
|
||||
info_ru: |
|
||||
Размер журнала в байтах. Большим быть не обязан, 16-32 МБ обычно достаточно.
|
||||
По умолчанию для журнала используется всё устройство журнала. Если же вы
|
||||
размещаете журнал на устройстве данных или метаданных, то вы должны
|
||||
установить эту опцию в какое-то значение сами (или использовать скрипт
|
||||
make-osd.sh).
|
||||
- name: meta_offset
|
||||
type: int
|
||||
default: 0
|
||||
info: |
|
||||
Offset on the device in bytes where the metadata area is stored.
|
||||
Again, set it to something if you colocate metadata with journal or data.
|
||||
info_ru: |
|
||||
Смещение на устройстве в байтах, по которому располагаются метаданные.
|
||||
Эту опцию нужно задать, если метаданные у вас хранятся на том же
|
||||
устройстве, что данные или журнал.
|
||||
- name: data_offset
|
||||
type: int
|
||||
default: 0
|
||||
info: |
|
||||
Offset on the device in bytes where the data area is stored.
|
||||
Again, set it to something if you colocate data with journal or metadata.
|
||||
info_ru: |
|
||||
Смещение на устройстве в байтах, по которому располагаются данные.
|
||||
Эту опцию нужно задать, если данные у вас хранятся на том же
|
||||
устройстве, что метаданные или журнал.
|
||||
- name: data_size
|
||||
type: int
|
||||
info: |
|
||||
Data area size in bytes. By default, the whole data device up to the end
|
||||
will be used for the data area, but you can restrict it if you want to use
|
||||
a smaller part. Note that there is no option to set metadata area size -
|
||||
it's derived from the data area size.
|
||||
info_ru: |
|
||||
Размер области данных в байтах. По умолчанию под данные будет использована
|
||||
вся доступная область устройства данных до конца устройства, но вы можете
|
||||
использовать эту опцию, чтобы ограничить её меньшим размером. Заметьте, что
|
||||
опции размера области метаданных нет - она вычисляется из размера области
|
||||
данных автоматически.
|
||||
- name: meta_block_size
|
||||
type: int
|
||||
default: 4096
|
||||
info: |
|
||||
Physical block size of the metadata device. 4096 for most current
|
||||
HDDs and SSDs.
|
||||
info_ru: |
|
||||
Размер физического блока устройства метаданных. 4096 для большинства
|
||||
современных SSD и HDD.
|
||||
- name: journal_block_size
|
||||
type: int
|
||||
default: 4096
|
||||
info: |
|
||||
Physical block size of the journal device. Must be a multiple of
|
||||
`disk_alignment`. 4096 for most current HDDs and SSDs.
|
||||
info_ru: |
|
||||
Размер физического блока устройства журнала. Должен быть кратен
|
||||
`disk_alignment`. 4096 для большинства современных SSD и HDD.
|
||||
- name: disable_data_fsync
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Do not issue fsyncs to the data device, i.e. do not flush its cache.
|
||||
Safe ONLY if your data device has write-through cache. If you disable
|
||||
the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
|
||||
that the cache disable command is run every time before starting Vitastor
|
||||
OSD, for example, in the systemd unit. See also `immediate_commit` option
|
||||
for the instructions to disable cache and how to benefit from it.
|
||||
info_ru: |
|
||||
Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
|
||||
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
|
||||
записью (write-through). Если вы отключаете кэш через `hdparm` или
|
||||
`scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
|
||||
выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
|
||||
Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
|
||||
и о том, как из этого извлечь выгоду.
|
||||
- name: disable_meta_fsync
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Same as disable_data_fsync, but for the metadata device. If the metadata
|
||||
device is not set or if the data device is used for the metadata the option
|
||||
is ignored and disable_data_fsync value is used instead of it.
|
||||
info_ru: |
|
||||
То же, что disable_data_fsync, но для устройства метаданных. Если устройство
|
||||
метаданных не задано или если оно равно устройству данных, значение опции
|
||||
игнорируется и вместо него используется значение опции disable_data_fsync.
|
||||
- name: disable_journal_fsync
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Same as disable_data_fsync, but for the journal device. If the journal
|
||||
device is not set or if the metadata device is used for the journal the
|
||||
option is ignored and disable_meta_fsync value is used instead of it. If
|
||||
the same device is used for data, metadata and journal the option is also
|
||||
ignored and disable_data_fsync value is used instead of it.
|
||||
info_ru: |
|
||||
То же, что disable_data_fsync, но для устройства журнала. Если устройство
|
||||
журнала не задано или если оно равно устройству метаданных, значение опции
|
||||
игнорируется и вместо него используется значение опции disable_meta_fsync.
|
||||
Если одно и то же устройство используется и под данные, и под журнал, и под
|
||||
метаданные - значение опции также игнорируется и вместо него используется
|
||||
значение опции disable_data_fsync.
|
||||
- name: disable_device_lock
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Do not lock data, metadata and journal block devices exclusively with
|
||||
flock(). Though it's not recommended, you can use it if you want to run
|
||||
multiple OSDs with a single device and different offsets, without using
|
||||
partitions.
|
||||
info_ru: |
|
||||
Не блокировать устройства данных, метаданных и журнала от открытия их
|
||||
другими OSD с помощью flock(). Так делать не рекомендуется, но теоретически
|
||||
вы можете это использовать, чтобы запускать несколько OSD на одном
|
||||
устройстве с разными смещениями и без использования разделов.
|
||||
- name: disk_alignment
|
||||
type: int
|
||||
default: 4096
|
||||
info: |
|
||||
Required physical disk write alignment. Most current SSD and HDD drives
|
||||
use 4 KB physical sectors even if they report 512 byte logical sector
|
||||
size, so 4 KB is a good default setting.
|
||||
|
||||
Note, however, that physical sector size also affects WA, because with block
|
||||
devices it's impossible to write anything smaller than a block. So, when
|
||||
Vitastor has to write a single metadata entry that's only about 32 bytes in
|
||||
size, it actually has to write the whole 4 KB sector.
|
||||
|
||||
Because of this it can actually be beneficial to use SSDs which work well
|
||||
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
|
||||
and meta_block_size. But the only SSD that may fit into this category is
|
||||
Intel Optane (probably, not tested yet).
|
||||
|
||||
Clients don't need to be aware of disk_alignment, so it's not required to
|
||||
put a modified value into etcd key /vitastor/config/global.
|
||||
info_ru: |
|
||||
Требуемое выравнивание записи на физические диски. Почти все современные
|
||||
SSD и HDD диски используют 4 КБ физические секторы, даже если показывают
|
||||
логический размер сектора 512 байт, поэтому 4 КБ - хорошее значение по
|
||||
умолчанию.
|
||||
|
||||
Однако стоит понимать, что физический размер сектора тоже влияет на
|
||||
избыточную запись (WA), потому что ничего меньше блока (сектора) на блочное
|
||||
устройство записать невозможно. Таким образом, когда Vitastor-у нужно
|
||||
записать на диск всего лишь одну 32-байтную запись метаданных, фактически
|
||||
приходится перезаписывать 4 КБ сектор целиком.
|
||||
|
||||
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
|
||||
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
|
||||
journal_block_size и meta_block_size. Однако единственные SSD, которые
|
||||
теоретически могут попасть в эту категорию - это Intel Optane (но и это
|
||||
пока не проверялось автором).
|
||||
|
||||
Клиентам не обязательно знать про disk_alignment, так что помещать значение
|
||||
этого параметра в etcd в /vitastor/config/global не нужно.
|
65
docs/params/monitor.yml
Normal file
65
docs/params/monitor.yml
Normal file
@@ -0,0 +1,65 @@
|
||||
- name: etcd_mon_ttl
|
||||
type: sec
|
||||
min: 10
|
||||
default: 30
|
||||
info: Monitor etcd lease refresh interval in seconds
|
||||
info_ru: Интервал обновления etcd резервации (lease) монитором
|
||||
- name: etcd_mon_timeout
|
||||
type: ms
|
||||
default: 1000
|
||||
info: etcd request timeout used by monitor
|
||||
info_ru: Таймаут выполнения запросов к etcd от монитора
|
||||
- name: etcd_mon_retries
|
||||
type: int
|
||||
default: 5
|
||||
info: Maximum number of attempts for one monitor etcd request
|
||||
info_ru: Максимальное число попыток выполнения запросов к etcd монитором
|
||||
- name: mon_change_timeout
|
||||
type: ms
|
||||
min: 100
|
||||
default: 1000
|
||||
info: Optimistic retry interval for monitor etcd modification requests
|
||||
info_ru: Время повтора при коллизиях при запросах модификации в etcd, производимых монитором
|
||||
- name: mon_stats_timeout
|
||||
type: ms
|
||||
min: 100
|
||||
default: 1000
|
||||
info: |
|
||||
Interval for monitor to wait before updating aggregated statistics in
|
||||
etcd after receiving OSD statistics updates
|
||||
info_ru: |
|
||||
Интервал, который монитор ожидает при изменении статистики по отдельным
|
||||
OSD перед обновлением агрегированной статистики в etcd
|
||||
- name: osd_out_time
|
||||
type: sec
|
||||
default: 600
|
||||
info: |
|
||||
Time after which a failed OSD is removed from the data distribution.
|
||||
I.e. time which the monitor waits before attempting to restore data
|
||||
redundancy using other OSDs.
|
||||
info_ru: |
|
||||
Время, через которое отключенный OSD исключается из распределения данных.
|
||||
То есть, время, которое монитор ожидает перед попыткой переместить данные
|
||||
на другие OSD и таким образом восстановить избыточность хранения.
|
||||
- name: placement_levels
|
||||
type: json
|
||||
default: '`{"host":100,"osd":101}`'
|
||||
info: |
|
||||
Levels for the placement tree. You can define arbitrary tree levels by
|
||||
defining them in this parameter. The configuration parameter value should
|
||||
contain a JSON object with level names as keys and integer priorities as
|
||||
values. Smaller priority means higher level in tree. For example,
|
||||
"datacenter" should have smaller priority than "osd". "host" and "osd"
|
||||
levels are always predefined and can't be removed. If one of them is not
|
||||
present in the configuration, then it is defined with the default priority
|
||||
(100 for "host", 101 for "osd").
|
||||
info_ru: |
|
||||
Определения уровней для дерева размещения OSD. Вы можете определять
|
||||
произвольные уровни, помещая их в данный параметр конфигурации. Значение
|
||||
параметра должно содержать JSON-объект, ключи которого будут являться
|
||||
названиями уровней, а значения - целочисленными приоритетами. Меньшие
|
||||
приоритеты соответствуют верхним уровням дерева. Например, уровень
|
||||
"датацентр" должен иметь меньший приоритет, чем "OSD". Уровни с названиями
|
||||
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
|
||||
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
|
||||
умолчанию (100 для уровня "host", 101 для "osd").
|
225
docs/params/network.yml
Normal file
225
docs/params/network.yml
Normal file
@@ -0,0 +1,225 @@
|
||||
- name: tcp_header_buffer_size
|
||||
type: int
|
||||
default: 65536
|
||||
info: |
|
||||
Size of the buffer used to read data using an additional copy. Vitastor
|
||||
packet headers are 128 bytes, payload is always at least 4 KB, so it is
|
||||
usually beneficial to try to read multiple packets at once even though
|
||||
it requires copying the data an additional time. The rest of each packet
|
||||
is received without an additional copy. You can try to play with this
|
||||
parameter and see how it affects random iops and linear bandwidth if you
|
||||
want.
|
||||
info_ru: |
|
||||
Размер буфера для чтения данных с дополнительным копированием. Пакеты
|
||||
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
|
||||
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
|
||||
сразу несколько пакетов, даже несмотря на то, что это требует лишний раз
|
||||
скопировать данные. Часть каждого пакета за пределами значения данного
|
||||
параметра читается без дополнительного копирования. Вы можете попробовать
|
||||
поменять этот параметр и посмотреть, как он влияет на производительность
|
||||
случайного и линейного доступа.
|
||||
- name: use_sync_send_recv
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
If true, synchronous send/recv syscalls are used instead of io_uring for
|
||||
socket communication. Useless for OSDs because they require io_uring anyway,
|
||||
but may be required for clients with old kernel versions.
|
||||
info_ru: |
|
||||
Если установлено в истину, то вместо io_uring для передачи данных по сети
|
||||
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
||||
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
||||
принципе, это может применяться для клиентов со старыми версиями ядра.
|
||||
- name: use_rdma
|
||||
type: bool
|
||||
default: true
|
||||
info: |
|
||||
Try to use RDMA for communication if it's available. Disable if you don't
|
||||
want Vitastor to use RDMA. RDMA increases the performance, but TCP-only
|
||||
clients can still talk to an RDMA-enabled cluster, so you don't need to
|
||||
make sure that all clients support RDMA when enabling it.
|
||||
info_ru: |
|
||||
Пытаться использовать RDMA для связи при наличии доступных устройств.
|
||||
Отключите, если вы не хотите, чтобы Vitastor использовал RDMA.
|
||||
RDMA улучшает производительность, но клиенты, использующие только TCP,
|
||||
по-прежнему могут работать с кластером, в котором включён RDMA, поэтому при
|
||||
включении RDMA не требуется, чтобы его поддерживали все клиенты.
|
||||
- name: rdma_device
|
||||
type: string
|
||||
info: |
|
||||
RDMA device name to use for Vitastor OSD communications (for example,
|
||||
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
|
||||
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
|
||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
info_ru: |
|
||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
|
||||
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
|
||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
- name: rdma_port_num
|
||||
type: int
|
||||
default: 1
|
||||
info: |
|
||||
RDMA device port number to use. Only for devices that have more than 1 port.
|
||||
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
|
||||
your device has.
|
||||
info_ru: |
|
||||
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
|
||||
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
|
||||
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
|
||||
`ibv_devinfo -v`.
|
||||
- name: rdma_gid_index
|
||||
type: int
|
||||
default: 0
|
||||
info: |
|
||||
Global address identifier index of the RDMA device to use. Different GID
|
||||
indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP.
|
||||
Search for "GID" in `ibv_devinfo -v` output to determine which GID index
|
||||
you need.
|
||||
|
||||
**IMPORTANT:** If you want to use RoCEv2 (as recommended) then the correct
|
||||
rdma_gid_index is usually 1 (IPv6) or 3 (IPv4).
|
||||
info_ru: |
|
||||
Номер глобального идентификатора адреса RDMA-устройства, который следует
|
||||
использовать. Разным gid_index могут соответствовать разные протоколы связи:
|
||||
RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со
|
||||
словом "GID" в выводе команды `ibv_devinfo -v`.
|
||||
|
||||
**ВАЖНО:** Если вы хотите использовать RoCEv2 (как мы и рекомендуем), то
|
||||
правильный rdma_gid_index, как правило, 1 (IPv6) или 3 (IPv4).
|
||||
- name: rdma_mtu
|
||||
type: int
|
||||
default: 4096
|
||||
info: |
|
||||
RDMA Path MTU to use. Must be 1024, 2048 or 4096. There is usually no
|
||||
sense to change it from the default 4096.
|
||||
info_ru: |
|
||||
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
|
||||
2048 или 4096. Обычно нет смысла менять значение по умолчанию, равное 4096.
|
||||
- name: rdma_max_sge
|
||||
type: int
|
||||
default: 128
|
||||
info: |
|
||||
Maximum number of scatter/gather entries to use for RDMA. OSDs negotiate
|
||||
the actual value when establishing connection anyway, so it's usually not
|
||||
required to change this parameter.
|
||||
info_ru: |
|
||||
Максимальное число записей разделения/сборки (scatter/gather) для RDMA.
|
||||
OSD в любом случае согласовывают реальное значение при установке соединения,
|
||||
так что менять этот параметр обычно не нужно.
|
||||
- name: rdma_max_msg
|
||||
type: int
|
||||
default: 1048576
|
||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
- name: rdma_max_recv
|
||||
type: int
|
||||
default: 8
|
||||
info: |
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
info_ru: |
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
- name: peer_connect_interval
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
info: Interval before attempting to reconnect to an unavailable OSD.
|
||||
info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
|
||||
- name: peer_connect_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
info: Timeout for OSD connection attempts.
|
||||
info_ru: Максимальное время ожидания попытки соединения с OSD.
|
||||
- name: osd_idle_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
info: |
|
||||
OSD connection inactivity time after which clients and other OSDs send
|
||||
keepalive requests to check state of the connection.
|
||||
info_ru: |
|
||||
Время неактивности соединения с OSD, после которого клиенты или другие OSD
|
||||
посылают запрос проверки состояния соединения.
|
||||
- name: osd_ping_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
info: |
|
||||
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
is scheduled.
|
||||
info_ru: |
|
||||
Максимальное время ожидания ответа на запрос проверки состояния соединения.
|
||||
Если OSD не отвечает за это время, соединение отключается и производится
|
||||
повторная попытка соединения.
|
||||
- name: up_wait_retry_interval
|
||||
type: ms
|
||||
min: 50
|
||||
default: 500
|
||||
info: |
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
the time for the clients to wait before re-attempting such I/O requests.
|
||||
info_ru: |
|
||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
|
||||
они отвечают клиентам специальным кодом ошибки, означающим, что клиент
|
||||
должен некоторое время подождать перед повторением запроса. Именно это время
|
||||
ожидания задаёт данный параметр.
|
||||
- name: max_etcd_attempts
|
||||
type: int
|
||||
default: 5
|
||||
info: |
|
||||
Maximum number of attempts for etcd requests which can't be retried
|
||||
indefinitely.
|
||||
info_ru: |
|
||||
Максимальное число попыток выполнения запросов к etcd для тех запросов,
|
||||
которые нельзя повторять бесконечно.
|
||||
- name: etcd_quick_timeout
|
||||
type: ms
|
||||
default: 1000
|
||||
info: |
|
||||
Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
info_ru: |
|
||||
Максимальное время выполнения запросов к etcd, которые должны завершаться
|
||||
быстро, таких, как обновление резервации (lease).
|
||||
- name: etcd_slow_timeout
|
||||
type: ms
|
||||
default: 5000
|
||||
info: Timeout for etcd requests which are allowed to wait for some time.
|
||||
info_ru: |
|
||||
Максимальное время выполнения запросов к etcd, для которых не обязательно
|
||||
гарантировать быстрое выполнение.
|
||||
- name: etcd_keepalive_timeout
|
||||
type: sec
|
||||
default: max(30, etcd_report_interval*2)
|
||||
info: |
|
||||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
info_ru: |
|
||||
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
- name: etcd_ws_keepalive_interval
|
||||
type: sec
|
||||
default: 30
|
||||
info: |
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
detect disconnections quickly.
|
||||
info_ru: |
|
||||
Интервал проверки живости вебсокет-подключений к etcd.
|
341
docs/params/osd.yml
Normal file
341
docs/params/osd.yml
Normal file
@@ -0,0 +1,341 @@
|
||||
- name: etcd_report_interval
|
||||
type: sec
|
||||
default: 5
|
||||
info: |
|
||||
Interval at which OSDs report their state to etcd. Affects OSD lease time
|
||||
and thus the failover speed. Lease time is equal to this parameter value
|
||||
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
|
||||
that every OSD always refreshes its lease in time.
|
||||
info_ru: |
|
||||
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
|
||||
влияет на время резервации (lease) OSD и поэтому на скорость переключения
|
||||
при падении OSD. Время lease равняется значению этого параметра плюс
|
||||
max_etcd_attempts * etcd_quick_timeout.
|
||||
- name: run_primary
|
||||
type: bool
|
||||
default: true
|
||||
info: |
|
||||
Start primary OSD logic on this OSD. As of now, can be turned off only for
|
||||
debugging purposes. It's possible to implement additional feature for the
|
||||
monitor which may allow to separate primary and secondary OSDs, but it's
|
||||
unclear why anyone could need it, so it's not implemented.
|
||||
info_ru: |
|
||||
Запускать логику первичного OSD на данном OSD. На данный момент отключать
|
||||
эту опцию может иметь смысл только в целях отладки. В теории, можно
|
||||
реализовать дополнительный режим для монитора, который позволит отделять
|
||||
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
||||
понадобиться, поэтому это не реализовано.
|
||||
- name: osd_network
|
||||
type: string or array of strings
|
||||
type_ru: строка или массив строк
|
||||
info: |
|
||||
Network mask of the network (IPv4 or IPv6) to use for OSDs. Note that
|
||||
although it's possible to specify multiple networks here, this does not
|
||||
mean that OSDs will create multiple listening sockets - they'll only
|
||||
pick the first matching address of an UP + RUNNING interface. Separate
|
||||
networks for cluster and client connections are also not implemented, but
|
||||
they are mostly useless anyway, so it's not a big deal.
|
||||
info_ru: |
|
||||
Маска подсети (IPv4 или IPv6) для использования для соединений с OSD.
|
||||
Имейте в виду, что хотя сейчас и можно передать в этот параметр несколько
|
||||
подсетей, это не означает, что OSD будут создавать несколько слушающих
|
||||
сокетов - они лишь будут выбирать адрес первого поднятого (состояние UP +
|
||||
RUNNING) интерфейса, подходящий под заданную маску. Также не реализовано разделение
|
||||
кластерной и публичной сетей OSD. Правда, от него обычно всё равно довольно
|
||||
мало толку, так что особенной проблемы в этом нет.
|
||||
- name: bind_address
|
||||
type: string
|
||||
default: "0.0.0.0"
|
||||
info: |
|
||||
Instead of the network mask, you can also set OSD listen address explicitly
|
||||
using this parameter. May be useful if you want to start OSDs on interfaces
|
||||
that are not UP + RUNNING.
|
||||
info_ru: |
|
||||
Этим параметром можно явным образом задать адрес, на котором будет ожидать
|
||||
соединений OSD (вместо использования маски подсети). Может быть полезно,
|
||||
например, чтобы запускать OSD на неподнятых интерфейсах (не UP + RUNNING).
|
||||
- name: bind_port
|
||||
type: int
|
||||
info: |
|
||||
By default, OSDs pick random ports to use for incoming connections
|
||||
automatically. With this option you can set a specific port for a specific
|
||||
OSD by hand.
|
||||
info_ru: |
|
||||
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
||||
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
||||
- name: autosync_interval
|
||||
type: sec
|
||||
default: 5
|
||||
info: |
|
||||
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||
the immediate_commit mode is disabled. fsyncs are required because without
|
||||
them OSDs quickly fill their journals, become unable to clear them and
|
||||
stall. Also this option limits the amount of recent uncommitted changes
|
||||
which OSDs may lose in case of a power outage if clients don't
|
||||
issue fsyncs at all.
|
||||
info_ru: |
|
||||
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
|
||||
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
|
||||
OSD, чтобы успевать очищать журнал - без них OSD быстро заполняют журналы и
|
||||
перестают обрабатывать операции записи. Также эта опция ограничивает объём
|
||||
недавних незафиксированных изменений, которые OSD могут терять при
|
||||
отключении питания, если клиенты вообще не отправляют fsync.
|
||||
- name: autosync_writes
|
||||
type: int
|
||||
default: 128
|
||||
info: |
|
||||
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||
operations before issuing an fsync operation internally.
|
||||
info_ru: |
|
||||
Аналогично autosync_interval, но задаёт не временной интервал, а
|
||||
максимальное количество незафиксированных операций записи перед
|
||||
принудительной отправкой fsync-а.
|
||||
- name: recovery_queue_depth
|
||||
type: int
|
||||
default: 4
|
||||
info: |
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed of recovery
|
||||
and rebalancing, but it's planned to implement more.
|
||||
info_ru: |
|
||||
Максимальное число операций восстановления на одном первичном OSD в любой
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
- name: recovery_sync_batch
|
||||
type: int
|
||||
default: 16
|
||||
info: Maximum number of recovery operations before issuing an additional fsync.
|
||||
info_ru: Максимальное число операций восстановления перед дополнительным fsync.
|
||||
- name: readonly
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Read-only mode. If this is enabled, an OSD will never issue any writes to
|
||||
the underlying device. This may be useful for recovery purposes.
|
||||
info_ru: |
|
||||
Режим "только чтение". Если включить этот режим, OSD не будет писать ничего
|
||||
на диск. Может быть полезно в целях восстановления.
|
||||
- name: no_recovery
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Disable automatic background recovery of objects. Note that it doesn't
|
||||
affect implicit recovery of objects happening during writes - a write is
|
||||
always made to a full set of at least pg_minsize OSDs.
|
||||
info_ru: |
|
||||
Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
|
||||
что эта опция не отключает восстановление объектов, происходящее при
|
||||
записи - запись всегда производится в полный набор из как минимум pg_minsize
|
||||
OSD.
|
||||
- name: no_rebalance
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Disable background movement of data between different OSDs. Disabling it
|
||||
means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
info_ru: |
|
||||
Отключить фоновое перемещение объектов между разными OSD. Отключение
|
||||
означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
|
||||
в нём на неопределённый срок.
|
||||
- name: print_stats_interval
|
||||
type: sec
|
||||
default: 3
|
||||
info: |
|
||||
Time interval at which OSDs print simple human-readable operation
|
||||
statistics on stdout.
|
||||
info_ru: |
|
||||
Временной интервал, с которым OSD печатают простую человекочитаемую
|
||||
статистику выполнения операций в стандартный вывод.
|
||||
- name: slow_log_interval
|
||||
type: sec
|
||||
default: 10
|
||||
info: |
|
||||
Time interval at which OSDs dump slow or stuck operations on stdout, if
|
||||
there are any. Also it's the time after which an operation is considered
|
||||
"slow".
|
||||
info_ru: |
|
||||
Временной интервал, с которым OSD выводят в стандартный вывод список
|
||||
медленных или зависших операций, если таковые имеются. Также время, при
|
||||
превышении которого операция считается "медленной".
|
||||
- name: max_write_iodepth
|
||||
type: int
|
||||
default: 128
|
||||
info: |
|
||||
Parallel client write operation limit per one OSD. Operations that exceed
|
||||
this limit are pushed to a temporary queue instead of being executed
|
||||
immediately.
|
||||
info_ru: |
|
||||
Максимальное число одновременных клиентских операций записи на один OSD.
|
||||
Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
|
||||
временной очереди.
|
||||
- name: min_flusher_count
|
||||
type: int
|
||||
default: 1
|
||||
info: |
|
||||
Flusher is a micro-thread that moves data from the journal to the data
|
||||
area of the device. Their number is auto-tuned between minimum and maximum.
|
||||
Minimum number is set by this parameter.
|
||||
info_ru: |
|
||||
Flusher - это микро-поток (корутина), которая копирует данные из журнала в
|
||||
основную область устройства данных. Их число настраивается динамически между
|
||||
минимальным и максимальным значением. Этот параметр задаёт минимальное число.
|
||||
- name: max_flusher_count
|
||||
type: int
|
||||
default: 256
|
||||
info: |
|
||||
Maximum number of journal flushers (see above min_flusher_count).
|
||||
info_ru: |
|
||||
Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
|
||||
- name: inmemory_metadata
|
||||
type: bool
|
||||
default: true
|
||||
info: |
|
||||
This parameter makes Vitastor always keep metadata area of the block device
|
||||
in memory. It's required for good performance because it allows to avoid
|
||||
additional read-modify-write cycles during metadata modifications. Metadata
|
||||
area size is currently roughly 224 MB per 1 TB of data. You can turn it off
|
||||
to reduce memory usage by this value, but it will hurt performance. This
|
||||
restriction is likely to be removed in the future along with the upgrade
|
||||
of the metadata storage scheme.
|
||||
info_ru: |
|
||||
Данный параметр заставляет Vitastor всегда держать область метаданных диска
|
||||
в памяти. Это нужно, чтобы избегать дополнительных операций чтения с диска
|
||||
при записи. Размер области метаданных на данный момент составляет примерно
|
||||
224 МБ на 1 ТБ данных. При отключении потребление памяти снизится примерно
|
||||
на эту величину, но при этом также снизится и производительность. В будущем,
|
||||
после обновления схемы хранения метаданных, это ограничение, скорее всего,
|
||||
будет ликвидировано.
|
||||
- name: inmemory_journal
|
||||
type: bool
|
||||
default: true
|
||||
info: |
|
||||
This parameter makes Vitastor always keep journal area of the block
|
||||
device in memory. Turning it off will, again, reduce memory usage, but
|
||||
hurt performance because flusher coroutines will have to read data from
|
||||
the disk back before copying it into the main area. The memory usage benefit
|
||||
is typically very small because it's sufficient to have 16-32 MB journal
|
||||
for SSD OSDs. However, in theory it's possible that you'll want to turn it
|
||||
off for hybrid (HDD+SSD) OSDs with large journals on quick devices.
|
||||
info_ru: |
|
||||
Данный параметр заставляет Vitastor всегда держать в памяти журналы OSD.
|
||||
Отключение параметра, опять же, снижает потребление памяти, но ухудшает
|
||||
производительность, так как для копирования данных из журнала в основную
|
||||
область устройства OSD будут вынуждены читать их обратно с диска. Выигрыш
|
||||
по памяти при этом обычно крайне низкий, так как для SSD OSD обычно
|
||||
достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
|
||||
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
|
||||
журналами, расположенными на быстром по сравнению с HDD устройстве.
|
||||
- name: journal_sector_buffer_count
|
||||
type: int
|
||||
default: 32
|
||||
info: |
|
||||
Maximum number of buffers that can be used for writing journal metadata
|
||||
blocks. The only situation when you should increase it to a larger value
|
||||
is when you enable journal_no_same_sector_overwrites. In this case set
|
||||
it to, for example, 1024.
|
||||
info_ru: |
|
||||
Максимальное число буферов, разрешённых для использования под записываемые
|
||||
в журнал блоки метаданных. Единственная ситуация, в которой этот параметр
|
||||
нужно менять - это если вы включаете journal_no_same_sector_overwrites. В
|
||||
этом случае установите данный параметр, например, в 1024.
|
||||
- name: journal_no_same_sector_overwrites
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Enable this option for SSDs like Intel D3-S4510 and D3-S4610 which REALLY
|
||||
don't like when a program overwrites the same sector multiple times in a
|
||||
row and slow down significantly (from 25000+ iops to ~3000 iops). When
|
||||
this option is set, Vitastor will always move to the next sector of the
|
||||
journal after writing it instead of possibly overwriting it the second time.
|
||||
info_ru: |
|
||||
Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
|
||||
ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
|
||||
подряд. Такие SSD при многократной перезаписи одного и того же сектора
|
||||
сильно замедляются - условно, с 25000 и более iops до 3000 iops. Когда
|
||||
данная опция установлена, Vitastor всегда переходит к следующему сектору
|
||||
журнала после записи вместо потенциально повторной перезаписи того же
|
||||
самого сектора.
|
||||
- name: throttle_small_writes
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
|
||||
with fast journal/metadata devices and slow data devices. The idea is that
|
||||
small writes complete very quickly because they're first written to the
|
||||
journal device, but moving them to the main device is slow. So if an OSD
|
||||
allows clients to issue a lot of small writes it will perform very well
|
||||
for several seconds and then the journal will fill up and the performance
|
||||
will drop to almost zero. Throttling is meant to prevent this problem by
|
||||
artificially slowing quick writes down based on the amount of free space in
|
||||
the journal. When throttling is used, the performance of small writes will
|
||||
decrease smoothly instead of abrupt drop at the moment when the journal
|
||||
fills up.
|
||||
info_ru: |
|
||||
Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
|
||||
гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
|
||||
данных. Идея заключается в том, что мелкие записи в этой ситуации могут
|
||||
завершаться очень быстро, так как они изначально записываются на быстрое
|
||||
журнальное устройство (SSD). Но перемещать их потом на основное медленное
|
||||
устройство долго. Поэтому если OSD быстро примет от клиентов очень много
|
||||
мелких операций записи, он быстро заполнит свой журнал, после чего
|
||||
производительность записи резко упадёт практически до нуля. Ограничение
|
||||
скорости записи призвано решить эту проблему с помощью искусственного
|
||||
замедления операций записи на основании объёма свободного места в журнале.
|
||||
Когда эта опция включена, производительность мелких операций записи будет
|
||||
снижаться плавно, а не резко в момент окончательного заполнения журнала.
|
||||
- name: throttle_target_iops
|
||||
type: int
|
||||
default: 100
|
||||
info: |
|
||||
Target maximum number of throttled operations per second under the condition
|
||||
of full journal. Set it to approximate random write iops of your data devices
|
||||
(HDDs).
|
||||
info_ru: |
|
||||
Расчётное максимальное число ограничиваемых операций в секунду при условии
|
||||
отсутствия свободного места в журнале. Устанавливайте приблизительно равным
|
||||
максимальной производительности случайной записи ваших устройств данных
|
||||
(HDD) в операциях в секунду.
|
||||
- name: throttle_target_mbs
|
||||
type: int
|
||||
default: 100
|
||||
info: |
|
||||
Target maximum bandwidth in MB/s of throttled operations under
|
||||
the condition of full journal. Set it to approximate linear write
|
||||
performance of your data devices (HDDs).
|
||||
info_ru: |
|
||||
Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
равным максимальной производительности линейной записи ваших устройств
|
||||
данных (HDD).
|
||||
- name: throttle_target_parallelism
|
||||
type: int
|
||||
default: 1
|
||||
info: |
|
||||
Target maximum parallelism of throttled operations under the condition of
|
||||
full journal. Set it to approximate internal parallelism of your data
|
||||
devices (1 for HDDs, 4-8 for SSDs).
|
||||
info_ru: |
|
||||
Расчётный максимальный параллелизм ограничиваемых операций при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
равным внутреннему параллелизму ваших устройств данных (1 для HDD, 4-8
|
||||
для SSD).
|
||||
- name: throttle_threshold_us
|
||||
type: us
|
||||
default: 50
|
||||
info: |
|
||||
Minimal computed delay to be applied to throttled operations. Usually
|
||||
doesn't need to be changed.
|
||||
info_ru: |
|
||||
Минимальная применимая к ограничиваемым операциям задержка. Обычно не
|
||||
требует изменений.
|
||||
- name: osd_memlock
|
||||
type: bool
|
||||
default: false
|
||||
info: >
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with
|
||||
mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
info_ru: >
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
|
||||
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
|
||||
заблокированной памяти).
|
39
mon/mon.js
39
mon/mon.js
@@ -83,13 +83,13 @@ const etcd_tree = {
|
||||
osd_idle_timeout: 5, // seconds. min: 1
|
||||
osd_ping_timeout: 5, // seconds. min: 1
|
||||
up_wait_retry_interval: 500, // ms. min: 50
|
||||
// osd
|
||||
etcd_report_interval: 5, // seconds
|
||||
max_etcd_attempts: 5,
|
||||
etcd_quick_timeout: 1000, // ms
|
||||
etcd_slow_timeout: 5000, // ms
|
||||
etcd_keepalive_timeout: 30, // seconds, default is min(30, etcd_report_interval*2)
|
||||
etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
|
||||
etcd_ws_keepalive_interval: 30, // seconds
|
||||
// osd
|
||||
etcd_report_interval: 5, // seconds
|
||||
run_primary: true,
|
||||
osd_network: null, // "192.168.7.0/24" or an array of masks
|
||||
bind_address: "0.0.0.0",
|
||||
@@ -104,6 +104,7 @@ const etcd_tree = {
|
||||
no_rebalance: false,
|
||||
print_stats_interval: 3,
|
||||
slow_log_interval: 10,
|
||||
osd_memlock: false,
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
@@ -130,6 +131,11 @@ const etcd_tree = {
|
||||
inmemory_journal,
|
||||
journal_sector_buffer_count,
|
||||
journal_no_same_sector_overwrites,
|
||||
throttle_small_writes: false,
|
||||
throttle_target_iops: 100,
|
||||
throttle_target_mbs: 100,
|
||||
throttle_target_parallelism: 1,
|
||||
throttle_threshold_us: 50,
|
||||
}, */
|
||||
global: {},
|
||||
/* node_placement: {
|
||||
@@ -1339,21 +1345,30 @@ class Mon
|
||||
const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
|
||||
for (const op in op_stats)
|
||||
{
|
||||
op_stats[op].bps = prev_stats ? (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm : 0;
|
||||
op_stats[op].iops = prev_stats ? (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm : 0;
|
||||
op_stats[op].lat = prev_stats ? (op_stats[op].usec - prev_stats.op_stats[op].usec)
|
||||
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n) : 0;
|
||||
if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
|
||||
{
|
||||
op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
|
||||
op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
|
||||
op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
|
||||
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
|
||||
}
|
||||
}
|
||||
for (const op in subop_stats)
|
||||
{
|
||||
subop_stats[op].iops = prev_stats ? (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm : 0;
|
||||
subop_stats[op].lat = prev_stats ? (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
|
||||
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n) : 0;
|
||||
if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
|
||||
{
|
||||
subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
|
||||
subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
|
||||
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
|
||||
}
|
||||
}
|
||||
for (const op in recovery_stats)
|
||||
{
|
||||
recovery_stats[op].bps = prev_stats ? (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm : 0;
|
||||
recovery_stats[op].iops = prev_stats ? (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm : 0;
|
||||
if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
|
||||
{
|
||||
recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
|
||||
recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
|
||||
}
|
||||
}
|
||||
return { op_stats, subop_stats, recovery_stats };
|
||||
}
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.6.12'
|
||||
VERSION = '0.6.15'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -25,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.6.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.12$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.6.15/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.15$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -34,7 +34,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.12.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.15.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.12
|
||||
Version: 0.6.15
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.12.el7.tar.gz
|
||||
Source0: vitastor-0.6.15.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -33,7 +33,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.12.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.15.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.12
|
||||
Version: 0.6.15
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.12.el8.tar.gz
|
||||
Source0: vitastor-0.6.15.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6.12")
|
||||
add_definitions(-DVERSION="0.6.15")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -155,7 +155,7 @@ target_link_libraries(vitastor-nbd
|
||||
# vitastor-cli
|
||||
add_executable(vitastor-cli
|
||||
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
|
||||
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
|
||||
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm_data.cpp cli_rm.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-cli
|
||||
vitastor_client
|
||||
|
@@ -1,3 +1,5 @@
|
||||
#include <sys/socket.h>
|
||||
#include <unistd.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <net/if.h>
|
||||
#include <sys/types.h>
|
||||
@@ -9,7 +11,7 @@
|
||||
|
||||
#include "addr_util.h"
|
||||
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr)
|
||||
{
|
||||
if (parse_port)
|
||||
{
|
||||
@@ -25,7 +27,7 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
|
||||
}
|
||||
if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
|
||||
{
|
||||
addr->sa_family = AF_INET;
|
||||
addr->ss_family = AF_INET;
|
||||
((struct sockaddr_in*)addr)->sin_port = htons(default_port);
|
||||
return true;
|
||||
}
|
||||
@@ -33,30 +35,30 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
|
||||
str = str.substr(1, str.length()-2);
|
||||
if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
|
||||
{
|
||||
addr->sa_family = AF_INET6;
|
||||
addr->ss_family = AF_INET6;
|
||||
((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string addr_to_string(const sockaddr &addr)
|
||||
std::string addr_to_string(const sockaddr_storage &addr)
|
||||
{
|
||||
char peer_str[256];
|
||||
bool ok = false;
|
||||
int port;
|
||||
if (addr.sa_family == AF_INET)
|
||||
if (addr.ss_family == AF_INET)
|
||||
{
|
||||
ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
|
||||
port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else if (addr.sa_family == AF_INET6)
|
||||
else if (addr.ss_family == AF_INET6)
|
||||
{
|
||||
ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
|
||||
port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
|
||||
}
|
||||
else
|
||||
throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
|
||||
throw std::runtime_error("Unknown address family "+std::to_string(addr.ss_family));
|
||||
if (!ok)
|
||||
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
|
||||
return std::string(peer_str)+":"+std::to_string(port);
|
||||
@@ -186,3 +188,51 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
|
||||
freeifaddrs(list);
|
||||
return addresses;
|
||||
}
|
||||
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
|
||||
{
|
||||
sockaddr_storage addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
if (listening_port)
|
||||
{
|
||||
if (bind_port == 0)
|
||||
{
|
||||
socklen_t len = sizeof(addr);
|
||||
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
|
||||
}
|
||||
*listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
*listening_port = bind_port;
|
||||
}
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog ? listen_backlog : 128) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
return listen_fd;
|
||||
}
|
||||
|
@@ -4,6 +4,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
|
||||
std::string addr_to_string(const sockaddr &addr);
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
|
||||
std::string addr_to_string(const sockaddr_storage &addr);
|
||||
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
|
||||
|
@@ -21,7 +21,7 @@
|
||||
// Memory alignment for direct I/O (usually 512 bytes)
|
||||
// All other alignments must be a multiple of this one
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 512
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Default block size is 128 KB, current allowed range is 4K - 128M
|
||||
|
@@ -415,8 +415,11 @@ stop_flusher:
|
||||
flusher->active_flushers++;
|
||||
resume_1:
|
||||
// Find it in clean_db
|
||||
clean_it = bs->clean_db.find(cur.oid);
|
||||
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||
{
|
||||
auto & clean_db = bs->clean_db_shard(cur.oid);
|
||||
auto clean_it = clean_db.find(cur.oid);
|
||||
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||
}
|
||||
// Scan dirty versions of the object
|
||||
if (!scan_dirty(1))
|
||||
{
|
||||
@@ -870,10 +873,11 @@ void journal_flusher_co::update_clean_db()
|
||||
#endif
|
||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(cur.oid);
|
||||
if (has_delete)
|
||||
{
|
||||
auto clean_it = bs->clean_db.find(cur.oid);
|
||||
bs->clean_db.erase(clean_it);
|
||||
auto clean_it = clean_db.find(cur.oid);
|
||||
clean_db.erase(clean_it);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
|
||||
clean_loc >> bs->block_order,
|
||||
@@ -884,7 +888,7 @@ void journal_flusher_co::update_clean_db()
|
||||
}
|
||||
else
|
||||
{
|
||||
bs->clean_db[cur.oid] = {
|
||||
clean_db[cur.oid] = {
|
||||
.version = cur.version,
|
||||
.location = clean_loc,
|
||||
};
|
||||
|
@@ -49,7 +49,6 @@ class journal_flusher_co
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||
|
||||
bool skip_copy, has_delete, has_writes;
|
||||
blockstore_clean_db_t::iterator clean_it;
|
||||
std::vector<copy_buffer_t> v;
|
||||
std::vector<copy_buffer_t>::iterator it;
|
||||
int copy_count;
|
||||
|
@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
auto op = submit_queue[op_idx];
|
||||
@@ -142,7 +142,6 @@ void blockstore_impl_t::loop()
|
||||
continue;
|
||||
}
|
||||
}
|
||||
unsigned ring_space = ringloop->space_left();
|
||||
unsigned prev_sqe_pos = ringloop->save();
|
||||
// 0 = can't submit
|
||||
// 1 = in progress
|
||||
@@ -199,9 +198,14 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't need to be blocked by previous modifications
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
// LIST doesn't have to be blocked by previous modifications
|
||||
// But don't do a lot of LISTs at once, because they're blocking and potentially slow
|
||||
if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
|
||||
{
|
||||
process_list(op);
|
||||
done_lists++;
|
||||
wr_st = 2;
|
||||
}
|
||||
}
|
||||
if (wr_st == 2)
|
||||
{
|
||||
@@ -212,7 +216,6 @@ void blockstore_impl_t::loop()
|
||||
ringloop->restore(prev_sqe_pos);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
PRIV(op)->wait_detail = 1 + ring_space;
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
@@ -282,7 +285,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
if (ringloop->space_left() < PRIV(op)->wait_detail)
|
||||
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
|
||||
{
|
||||
// stop submission if there's still no free space
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
@@ -425,22 +428,104 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return the clean_db shard which holds (or should hold) the entry for <oid>.
// The shard key is the pool ID (upper POOL_ID_BITS bits of the inode number)
// combined with a PG number. Pools without recorded shard settings use PG number 0.
// The shard is created on first access.
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
    const uint64_t pool = (oid.inode >> (64-POOL_ID_BITS));
    uint64_t shard_pg = 0;
    const auto settings_it = clean_db_settings.find(pool);
    if (settings_it != clean_db_settings.end())
    {
        // like map_to_pg()
        const auto & cfg = settings_it->second;
        shard_pg = (oid.stripe / cfg.pg_stripe_size) % cfg.pg_count + 1;
    }
    const uint64_t shard_key = (pool << (64-POOL_ID_BITS)) | shard_pg;
    return clean_db_shards[shard_key];
}
|
||||
|
||||
// Redistribute all clean_db entries of <pool> into per-PG shards according to
// the given PG count and PG stripe size, then remember these settings for the pool.
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
    const uint64_t pool_num = (uint64_t)pool;
    // Key of the lowest possible shard of this pool
    const uint64_t pool_prefix = (pool_num << (64-POOL_ID_BITS));
    std::map<pool_pg_id_t, blockstore_clean_db_t> regrouped;
    // Take every existing shard of the pool, regroup its entries and drop the old shard
    auto old_it = clean_db_shards.lower_bound(pool_prefix);
    while (old_it != clean_db_shards.end() &&
        (old_it->first >> (64-POOL_ID_BITS)) == pool_num)
    {
        for (auto & entry: old_it->second)
        {
            // like map_to_pg()
            uint64_t pg_num = (entry.first.stripe / pg_stripe_size) % pg_count + 1;
            regrouped[pool_prefix | pg_num][entry.first] = entry.second;
        }
        old_it = clean_db_shards.erase(old_it);
    }
    // Install the regrouped shards without copying their contents again
    for (auto & shard: regrouped)
    {
        clean_db_shards[shard.first].swap(shard.second);
    }
    auto & settings = clean_db_settings[pool_num];
    settings.pg_count = pg_count;
    settings.pg_stripe_size = pg_stripe_size;
}
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->offset;
|
||||
uint32_t list_pg = op->offset+1;
|
||||
uint32_t pg_count = op->len;
|
||||
uint64_t pg_stripe_size = op->oid.stripe;
|
||||
uint64_t min_inode = op->oid.inode;
|
||||
uint64_t max_inode = op->version;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
// Copy clean_db entries (sorted)
|
||||
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
|
||||
// Check if the DB needs resharding
|
||||
// (we don't know about PGs from the beginning, we only create "shards" here)
|
||||
uint64_t first_shard = 0, last_shard = UINT64_MAX;
|
||||
if (min_inode != 0 &&
|
||||
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
|
||||
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
|
||||
{
|
||||
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
|
||||
if (pg_count > 1)
|
||||
{
|
||||
// Per-pg listing
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it == clean_db_settings.end() ||
|
||||
sh_it->second.pg_count != pg_count ||
|
||||
sh_it->second.pg_stripe_size != pg_stripe_size)
|
||||
{
|
||||
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
|
||||
}
|
||||
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-pool listing
|
||||
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
|
||||
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
|
||||
}
|
||||
}
|
||||
// Copy clean_db entries
|
||||
int stable_count = 0, stable_alloc = 0;
|
||||
if (min_inode != max_inode)
|
||||
{
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
@@ -448,7 +533,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
{
|
||||
@@ -463,26 +552,28 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
stable_alloc *= 2;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
stable_alloc += 32768;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
}
|
||||
}
|
||||
if (first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
// Copy dirty_db entries (sorted, too)
|
||||
int unstable_count = 0, unstable_alloc = 0;
|
||||
@@ -508,7 +599,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
for (; dirty_it != dirty_end; dirty_it++)
|
||||
{
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
|
||||
{
|
||||
if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
|
@@ -55,9 +55,10 @@
|
||||
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
|
||||
|
||||
#define BS_SUBMIT_CHECK_SQES(n) \
|
||||
if (ringloop->space_left() < (n))\
|
||||
if (ringloop->sqes_left() < (n))\
|
||||
{\
|
||||
/* Pause until there are more requests available */\
|
||||
PRIV(op)->wait_detail = (n);\
|
||||
PRIV(op)->wait_for = WAIT_SQE;\
|
||||
return 0;\
|
||||
}
|
||||
@@ -71,6 +72,7 @@
|
||||
if (!sqe)\
|
||||
{\
|
||||
/* Pause until there are more requests available */\
|
||||
PRIV(op)->wait_detail = 1;\
|
||||
PRIV(op)->wait_for = WAIT_SQE;\
|
||||
return 0;\
|
||||
}
|
||||
@@ -80,6 +82,7 @@
|
||||
if (!sqe)\
|
||||
{\
|
||||
/* Pause until there are more requests available */\
|
||||
PRIV(op)->wait_detail = 1;\
|
||||
PRIV(op)->wait_for = WAIT_SQE;\
|
||||
return 0;\
|
||||
}
|
||||
@@ -201,6 +204,17 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
|
||||
|
||||
#include "blockstore_flush.h"
|
||||
|
||||
typedef uint32_t pool_id_t;
|
||||
typedef uint64_t pool_pg_id_t;
|
||||
|
||||
#define POOL_ID_BITS 16
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
class blockstore_impl_t
|
||||
{
|
||||
/******* OPTIONS *******/
|
||||
@@ -238,11 +252,14 @@ class blockstore_impl_t
|
||||
int throttle_target_parallelism = 1;
|
||||
// Minimum difference in microseconds between target and real execution times to throttle the response
|
||||
int throttle_threshold_us = 50;
|
||||
// Maximum number of LIST operations to be processed during a single loop() pass (<= 0 means no limit)
|
||||
int single_tick_list_limit = 1;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
|
||||
blockstore_clean_db_t clean_db;
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
uint8_t *clean_bitmap = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
@@ -291,6 +308,9 @@ class blockstore_impl_t
|
||||
void open_journal();
|
||||
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
|
||||
|
||||
blockstore_clean_db_t& clean_db_shard(object_id oid);
|
||||
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
|
||||
|
@@ -222,10 +222,11 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
||||
}
|
||||
if (entry->oid.inode > 0)
|
||||
{
|
||||
auto clean_it = bs->clean_db.find(entry->oid);
|
||||
if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
|
||||
auto & clean_db = bs->clean_db_shard(entry->oid);
|
||||
auto clean_it = clean_db.find(entry->oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
|
||||
{
|
||||
if (clean_it != bs->clean_db.end())
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
// free the previous block
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
@@ -245,7 +246,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
||||
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
#endif
|
||||
bs->data_alloc->set(done_cnt+i, true);
|
||||
bs->clean_db[entry->oid] = (struct clean_entry){
|
||||
clean_db[entry->oid] = (struct clean_entry){
|
||||
.version = entry->version,
|
||||
.location = (done_cnt+i) << block_order,
|
||||
};
|
||||
@@ -656,8 +657,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
init_write_sector = proc_pos;
|
||||
return 0;
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->small_write.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
auto & clean_db = bs->clean_db_shard(je->small_write.oid);
|
||||
auto clean_it = clean_db.find(je->small_write.oid);
|
||||
if (clean_it == clean_db.end() ||
|
||||
clean_it->second.version < je->small_write.version)
|
||||
{
|
||||
obj_ver_id ov = {
|
||||
@@ -735,8 +737,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
auto & clean_db = bs->clean_db_shard(je->big_write.oid);
|
||||
auto clean_it = clean_db.find(je->big_write.oid);
|
||||
if (clean_it == clean_db.end() ||
|
||||
clean_it->second.version < je->big_write.version)
|
||||
{
|
||||
// oid, version, block
|
||||
@@ -841,8 +844,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
dirty_it--;
|
||||
dirty_exists = dirty_it->first.oid == je->del.oid;
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->del.oid);
|
||||
bool clean_exists = (clean_it != bs->clean_db.end() &&
|
||||
auto & clean_db = bs->clean_db_shard(je->del.oid);
|
||||
auto clean_it = clean_db.find(je->del.oid);
|
||||
bool clean_exists = (clean_it != clean_db.end() &&
|
||||
clean_it->second.version < je->del.version);
|
||||
if (!clean_exists && dirty_exists)
|
||||
{
|
||||
@@ -901,8 +905,9 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != bs->clean_db.end()
|
||||
auto & clean_db = bs->clean_db_shard(oid);
|
||||
auto clean_it = clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
if (exists && clean_loc == UINT64_MAX)
|
||||
{
|
||||
|
@@ -111,6 +111,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
|
||||
|
||||
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(read_op->oid);
|
||||
auto clean_it = clean_db.find(read_op->oid);
|
||||
auto dirty_it = dirty_db.upper_bound((obj_ver_id){
|
||||
.oid = read_op->oid,
|
||||
@@ -297,6 +298,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
|
||||
dirty_it--;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(oid);
|
||||
auto clean_it = clean_db.find(oid);
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
|
@@ -54,6 +54,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
auto dirty_it = dirty_db.find(*v);
|
||||
if (dirty_it == dirty_db.end())
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v->oid);
|
||||
auto clean_it = clean_db.find(v->oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
|
||||
{
|
||||
@@ -188,6 +189,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
}
|
||||
if (exists == -1)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
exists = clean_it != clean_db.end() ? 1 : 0;
|
||||
}
|
||||
@@ -215,6 +217,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
|
@@ -41,6 +41,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(op->oid);
|
||||
auto clean_it = clean_db.find(op->oid);
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
|
@@ -365,6 +365,13 @@ void cli_tool_t::run(json11::Json cfg)
|
||||
if (action_cb != NULL)
|
||||
ringloop->wait();
|
||||
}
|
||||
// Destroy the client
|
||||
delete cli;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
cli = NULL;
|
||||
epmgr = NULL;
|
||||
ringloop = NULL;
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
@@ -374,5 +381,6 @@ int main(int narg, const char *args[])
|
||||
exe_name = args[0];
|
||||
cli_tool_t *p = new cli_tool_t();
|
||||
p->run(cli_tool_t::parse_args(narg, args));
|
||||
delete p;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -108,9 +108,14 @@ resume_1:
|
||||
pool_avail = pg_free;
|
||||
}
|
||||
}
|
||||
if (pool_avail == UINT64_MAX)
|
||||
{
|
||||
pool_avail = 0;
|
||||
}
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
pool_avail = pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
||||
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
||||
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
|
||||
}
|
||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||
{ "name", pool_cfg.name },
|
||||
@@ -189,11 +194,16 @@ resume_1:
|
||||
json11::Json::array list;
|
||||
for (auto & kv: pool_stats)
|
||||
{
|
||||
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
|
||||
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
|
||||
double raw_to = kv.second["raw_to_usable"].number_value();
|
||||
if (raw_to < 0.000001 && raw_to > -0.000001)
|
||||
raw_to = 1;
|
||||
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
|
||||
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
|
||||
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
|
||||
kv.second["used_pct"] = format_q(100 - 100*kv.second["max_available"].uint64_value() *
|
||||
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())+"%";
|
||||
kv.second["used_pct"] = format_q(kv.second["total_raw"].uint64_value()
|
||||
? (100 - 100*kv.second["max_available"].uint64_value() *
|
||||
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())
|
||||
: 100)+"%";
|
||||
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
|
||||
}
|
||||
printf("%s", print_table(to_list(), cols, parent->color).c_str());
|
||||
|
@@ -154,7 +154,7 @@ resume_1:
|
||||
if (pool_it != parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
auto & pool_cfg = pool_it->second;
|
||||
used_size = used_size / pool_pg_real_size[pool_id]
|
||||
used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1)
|
||||
* (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
}
|
||||
auto stat_it = stats.find(inode_num);
|
||||
|
683
src/cli_rm.cpp
683
src/cli_rm.cpp
@@ -1,211 +1,566 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <fcntl.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
struct rm_pg_t
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating through them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// <parent> - <from> - <layer 2> - <to> - <child 1>
|
||||
// \ \ \- <child 2>
|
||||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
int current_child = 0;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
|
||||
void start_delete()
|
||||
bool is_done()
|
||||
{
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
return state == 9;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 9)
|
||||
goto resume_9;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
});
|
||||
if (min_offset == 0)
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
resume_2:
|
||||
while (!cb())
|
||||
{
|
||||
total_count += objects.size();
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
resume_4:
|
||||
while (!cb())
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
resume_5:
|
||||
while (!cb())
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb())
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 9;
|
||||
resume_9:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
// Check that to_cfg is actually a child of from_cfg
|
||||
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->num != from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
sources[item] = i--;
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
if (!ic.second.parent_id)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
continue;
|
||||
}
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
auto it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
||||
state = 2;
|
||||
}
|
||||
}
|
||||
|
||||
bool loop()
|
||||
void read_stats()
|
||||
{
|
||||
if (state == 0)
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
start_delete();
|
||||
state = 1;
|
||||
return;
|
||||
}
|
||||
else if (state == 1)
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
continue_delete();
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
else if (state == 2)
|
||||
for (auto cp: sources)
|
||||
{
|
||||
return true;
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
return false;
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
exit(1);
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void choose_inverse_candidate()
|
||||
{
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::array cmp = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
};
|
||||
json11::Json::array txn = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||
{ "value", base64_encode(json11::Json({
|
||||
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||
}).dump()) },
|
||||
} },
|
||||
},
|
||||
};
|
||||
// Reparent children of inverse_child
|
||||
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cp.second.parent_id == child_cfg->num)
|
||||
{
|
||||
auto cp_cfg = cp.second;
|
||||
cp_cfg.parent_id = inverse_parent;
|
||||
auto cp_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||
);
|
||||
cmp.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cp_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cp.second.mod_revision+1 },
|
||||
});
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cp_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Parent (%s), child (%s), or one of its children"
|
||||
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
// Remove the metadata of layer <cur> from etcd: its inode configuration key and
// the image index entry of its name, in one transaction guarded by the
// configuration's mod_revision. Exits the process when the inode is unknown,
// etcd returns an error or the guard fails. parent->waiting is incremented
// until the reply arrives.
void delete_inode_config(inode_t cur)
{
    auto & inode_cfgs = parent->cli->st_cli.inode_config;
    auto found = inode_cfgs.find(cur);
    if (found == inode_cfgs.end())
    {
        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
        exit(1);
    }
    std::string cur_name = found->second.name;
    std::string cur_cfg_key = base64_encode(
        parent->cli->st_cli.etcd_prefix+
        "/config/inode/"+std::to_string(INODE_POOL(cur))+
        "/"+std::to_string(INODE_NO_POOL(cur))
    );
    // Guard: the configuration must not have been modified since it was read
    json11::Json::array checks {
        json11::Json::object {
            { "target", "MOD" },
            { "key", cur_cfg_key },
            { "result", "LESS" },
            { "mod_revision", found->second.mod_revision+1 },
        },
    };
    json11::Json::array ops {
        json11::Json::object {
            { "request_delete_range", json11::Json::object {
                { "key", cur_cfg_key },
            } },
        },
        json11::Json::object {
            { "request_delete_range", json11::Json::object {
                { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
            } },
        },
    };
    parent->waiting++;
    parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
        { "compare", checks },
        { "success", ops },
    }, [this, cur_name](std::string err, json11::Json res)
    {
        parent->waiting--;
        if (!err.empty())
        {
            fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
            exit(1);
        }
        if (!res["succeeded"].bool_value())
        {
            fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
            exit(1);
        }
        printf("Layer %s deleted\n", cur_name.c_str());
        // Let the event loop resume the state machine waiting on parent->waiting
        parent->ringloop->wakeup();
    });
}
|
||||
|
||||
// Start a "merge-data" operation which merges layers from_name..<child_inode>
// into <target_inode> without deleting the source, and store its step callback
// in <cb>. Exits the process when either inode is not present in the cached
// inode configuration.
void start_merge_child(inode_t child_inode, inode_t target_inode)
{
    auto & inode_cfgs = parent->cli->st_cli.inode_config;
    auto child_pos = inode_cfgs.find(child_inode);
    if (child_pos == inode_cfgs.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", child_inode);
        exit(1);
    }
    auto target_pos = inode_cfgs.find(target_inode);
    if (target_pos == inode_cfgs.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", target_inode);
        exit(1);
    }
    json11::Json::object merge_cfg {
        { "command", json11::Json::array{ "merge-data", from_name, child_pos->second.name } },
        { "target", target_pos->second.name },
        { "delete-source", false },
        { "cas", use_cas },
        { "fsync-interval", fsync_interval },
    };
    cb = parent->start_merge(merge_cfg);
}
|
||||
|
||||
// Start removal of all data of <inode> through cli_tool_t::start_rm and store
// its step callback in <cb>. Exits the process when the inode is not present
// in the cached inode configuration.
void start_delete_source(inode_t inode)
{
    auto & inode_cfgs = parent->cli->st_cli.inode_config;
    if (inode_cfgs.find(inode) == inode_cfgs.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", inode);
        exit(1);
    }
    json11::Json::object rm_cfg {
        { "inode", inode },
        { "pool", (uint64_t)INODE_POOL(inode) },
        { "fsync-interval", fsync_interval },
    };
    cb = parent->start_rm(rm_cfg);
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
||||
{
|
||||
auto remover = new rm_inode_t();
|
||||
remover->parent = this;
|
||||
remover->inode = cfg["inode"].uint64_value();
|
||||
remover->pool_id = cfg["pool"].uint64_value();
|
||||
if (remover->pool_id)
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto snap_remover = new snap_remover_t();
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
if (snap_remover->from_name == "")
|
||||
{
|
||||
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
if (!remover->pool_id)
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
fprintf(stderr, "Layer to remove argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
||||
return [remover]()
|
||||
if (snap_remover->to_name == "")
|
||||
{
|
||||
if (remover->loop())
|
||||
snap_remover->to_name = snap_remover->from_name;
|
||||
}
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover]()
|
||||
{
|
||||
snap_remover->loop();
|
||||
if (snap_remover->is_done())
|
||||
{
|
||||
delete remover;
|
||||
delete snap_remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
214
src/cli_rm_data.cpp
Normal file
214
src/cli_rm_data.cpp
Normal file
@@ -0,0 +1,214 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
|
||||
void start_delete()
|
||||
{
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
});
|
||||
if (min_offset == 0)
|
||||
{
|
||||
total_count += objects.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
||||
exit(1);
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
// Already checked that it exists above, but anyway
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
||||
state = 2;
|
||||
}
|
||||
}
|
||||
|
||||
bool loop()
|
||||
{
|
||||
if (state == 0)
|
||||
{
|
||||
start_delete();
|
||||
state = 1;
|
||||
}
|
||||
else if (state == 1)
|
||||
{
|
||||
continue_delete();
|
||||
}
|
||||
else if (state == 2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
{
|
||||
auto remover = new rm_inode_t();
|
||||
remover->parent = this;
|
||||
remover->inode = cfg["inode"].uint64_value();
|
||||
remover->pool_id = cfg["pool"].uint64_value();
|
||||
if (remover->pool_id)
|
||||
{
|
||||
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
if (!remover->pool_id)
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
||||
return [remover]()
|
||||
{
|
||||
if (remover->loop())
|
||||
{
|
||||
delete remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -1,568 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <fcntl.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating through them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// <parent> - <from> - <layer 2> - <to> - <child 1>
|
||||
// \ \ \- <child 2>
|
||||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int current_child = 0;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 9;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 9)
|
||||
goto resume_9;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
resume_2:
|
||||
while (!cb())
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
resume_4:
|
||||
while (!cb())
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
resume_5:
|
||||
while (!cb())
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb())
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 9;
|
||||
resume_9:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
// Check that to_cfg is actually a child of from_cfg
|
||||
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->num != from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
sources[item] = i--;
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (!ic.second.parent_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void read_stats()
|
||||
{
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
for (auto cp: sources)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
exit(1);
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void choose_inverse_candidate()
|
||||
{
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::array cmp = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
};
|
||||
json11::Json::array txn = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||
{ "value", base64_encode(json11::Json({
|
||||
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||
}).dump()) },
|
||||
} },
|
||||
},
|
||||
};
|
||||
// Reparent children of inverse_child
|
||||
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cp.second.parent_id == child_cfg->num)
|
||||
{
|
||||
auto cp_cfg = cp.second;
|
||||
cp_cfg.parent_id = inverse_parent;
|
||||
auto cp_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||
);
|
||||
cmp.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cp_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cp.second.mod_revision+1 },
|
||||
});
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cp_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Parent (%s), child (%s), or one of its children"
|
||||
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
// Removes the metadata of inode <cur> from etcd: its configuration key and the
// "<etcd_prefix>/index/image/<name>" key, in one transaction guarded by the
// mod_revision of the configuration key.
// Increments parent->waiting until the reply arrives. Terminates the process
// if the inode is unknown, on an etcd error, or if the guard fails.
void delete_inode_config(inode_t cur)
{
    auto & inodes = parent->cli->st_cli.inode_config;
    auto found = inodes.find(cur);
    if (found == inodes.end())
    {
        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
        exit(1);
    }
    inode_config_t *cur_cfg = &found->second;
    std::string cur_name = cur_cfg->name;
    std::string cur_cfg_key = base64_encode(
        parent->cli->st_cli.etcd_prefix+
        "/config/inode/"+std::to_string(INODE_POOL(cur))+
        "/"+std::to_string(INODE_NO_POOL(cur))
    );
    parent->waiting++;
    // Only proceed if the configuration was not modified in the meantime
    json11::Json::array guards = json11::Json::array {
        json11::Json::object {
            { "target", "MOD" },
            { "key", cur_cfg_key },
            { "result", "LESS" },
            { "mod_revision", cur_cfg->mod_revision+1 },
        },
    };
    json11::Json::array deletions = json11::Json::array {
        json11::Json::object {
            { "request_delete_range", json11::Json::object {
                { "key", cur_cfg_key },
            } },
        },
        json11::Json::object {
            { "request_delete_range", json11::Json::object {
                { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
            } },
        },
    };
    parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
        { "compare", guards },
        { "success", deletions },
    }, [this, cur_name](std::string err, json11::Json res)
    {
        parent->waiting--;
        if (err != "")
        {
            fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
            exit(1);
        }
        if (!res["succeeded"].bool_value())
        {
            fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
            exit(1);
        }
        printf("Layer %s deleted\n", cur_name.c_str());
        parent->ringloop->wakeup();
    });
}
|
||||
|
||||
// Starts a "merge-data" operation for layers <from_name>..<child_inode> with
// <target_inode> as the target, without deleting the source, and stores its
// progress callback in <cb>.
// Terminates the process if either inode is not present in the configuration.
void start_merge_child(inode_t child_inode, inode_t target_inode)
{
    auto & inodes = parent->cli->st_cli.inode_config;
    auto child_pos = inodes.find(child_inode);
    if (child_pos == inodes.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", child_inode);
        exit(1);
    }
    auto target_pos = inodes.find(target_inode);
    if (target_pos == inodes.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", target_inode);
        exit(1);
    }
    const std::string & child_layer = child_pos->second.name;
    const std::string & target_layer = target_pos->second.name;
    cb = parent->start_merge(json11::Json::object {
        { "command", json11::Json::array{ "merge-data", from_name, child_layer } },
        { "target", target_layer },
        { "delete-source", false },
        { "cas", use_cas },
        { "fsync-interval", fsync_interval },
    });
}
|
||||
|
||||
// Starts the removal operation (parent->start_rm) for <inode> and stores its
// progress callback in <cb>.
// Terminates the process if the inode is not present in the configuration.
void start_delete_source(inode_t inode)
{
    auto & inodes = parent->cli->st_cli.inode_config;
    // The lookup is only an existence check, the entry itself is not used
    if (inodes.find(inode) == inodes.end())
    {
        fprintf(stderr, "Inode %ld disappeared\n", inode);
        exit(1);
    }
    cb = parent->start_rm(json11::Json::object {
        { "inode", inode },
        { "pool", (uint64_t)INODE_POOL(inode) },
        { "fsync-interval", fsync_interval },
    });
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto snap_remover = new snap_remover_t();
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
if (snap_remover->from_name == "")
|
||||
{
|
||||
fprintf(stderr, "Layer to remove argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (snap_remover->to_name == "")
|
||||
{
|
||||
snap_remover->to_name = snap_remover->from_name;
|
||||
}
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover]()
|
||||
{
|
||||
snap_remover->loop();
|
||||
if (snap_remover->is_done())
|
||||
{
|
||||
delete snap_remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
}
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
{
|
||||
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||
{
|
||||
if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
|
||||
{
|
||||
@@ -151,7 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning
|
||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -172,6 +172,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
@@ -191,6 +192,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
|
@@ -200,7 +200,8 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
|
||||
// Already checked that it exists above, but anyway
|
||||
op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
@@ -13,6 +13,7 @@
|
||||
epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
|
||||
{
|
||||
this->ringloop = ringloop;
|
||||
this->pending = false;
|
||||
|
||||
epoll_fd = epoll_create(1);
|
||||
if (epoll_fd < 0)
|
||||
@@ -22,11 +23,19 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
|
||||
|
||||
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
|
||||
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
if (pending)
|
||||
handle_epoll_events();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
|
||||
handle_epoll_events();
|
||||
}
|
||||
|
||||
epoll_manager_t::~epoll_manager_t()
|
||||
{
|
||||
ringloop->unregister_consumer(&consumer);
|
||||
if (tfd)
|
||||
{
|
||||
delete tfd;
|
||||
@@ -64,8 +73,13 @@ void epoll_manager_t::handle_epoll_events()
|
||||
io_uring_sqe *sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
||||
// Don't handle epoll events until we manage to post the next event handler
|
||||
// otherwise we'll fall out of sync with EPOLLET
|
||||
pending = true;
|
||||
ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
pending = false;
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
||||
data->callback = [this](ring_data_t *data)
|
||||
|
@@ -11,6 +11,8 @@
|
||||
class epoll_manager_t
|
||||
{
|
||||
int epoll_fd;
|
||||
bool pending;
|
||||
ring_consumer_t consumer;
|
||||
ring_loop_t *ringloop;
|
||||
std::map<int, std::function<void(int, int)>> epoll_handlers;
|
||||
public:
|
||||
|
@@ -351,9 +351,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("+++ %s 0x%lx 0x%llx+%llx\n",
|
||||
printf("+++ %s 0x%lx 0x%llx+%lx\n",
|
||||
io->ddir == DDIR_READ ? "READ" : "WRITE",
|
||||
(uint64_t)io, io->offset, io->xfer_buflen);
|
||||
(uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,4 +1,3 @@
|
||||
extern "C" {
|
||||
// Kill atomics in fio headers
|
||||
#define _STDATOMIC_H
|
||||
#include "fio/arch/arch.h"
|
||||
@@ -11,6 +10,7 @@ extern "C" {
|
||||
#define CONFIG_HAVE_GETTID
|
||||
#define CONFIG_SYNC_FILE_RANGE
|
||||
#define CONFIG_PWRITEV2
|
||||
extern "C" {
|
||||
#include "fio/fio.h"
|
||||
#include "fio/optgroup.h"
|
||||
}
|
||||
|
@@ -170,14 +170,14 @@ static int sec_init(struct thread_data *td)
|
||||
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
|
||||
bsd->block_size = 1 << o->block_order;
|
||||
|
||||
sockaddr addr;
|
||||
sockaddr_storage addr;
|
||||
if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
|
||||
{
|
||||
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (bsd->connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
@@ -192,11 +192,16 @@ static int sec_init(struct thread_data *td)
|
||||
setsockopt(bsd->connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
if (o->zerocopy_send)
|
||||
{
|
||||
#ifndef SO_ZEROCOPY
|
||||
perror("zerocopy send not supported on your system (socket.h misses SO_ZEROCOPY)");
|
||||
return 1;
|
||||
#else
|
||||
if (setsockopt(bsd->connect_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
|
||||
{
|
||||
perror("setsockopt zerocopy");
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// FIXME: read config (block size) from OSD
|
||||
@@ -306,7 +311,13 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
||||
wtotal += io->xfer_buflen;
|
||||
}
|
||||
if (sendv_blocking(bsd->connect_fd, iov, iovcnt, opt->zerocopy_send ? MSG_ZEROCOPY : 0) != wtotal)
|
||||
if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
|
||||
#ifdef SO_ZEROCOPY
|
||||
opt->zerocopy_send ? MSG_ZEROCOPY : 0
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
) != wtotal)
|
||||
{
|
||||
perror("sendmsg");
|
||||
exit(1);
|
||||
@@ -344,7 +355,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||
{
|
||||
if (reply.hdr.retval != io->xfer_buflen)
|
||||
{
|
||||
fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
|
||||
fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||
exit(1);
|
||||
}
|
||||
// Support bitmap
|
||||
@@ -369,7 +380,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||
{
|
||||
if (reply.hdr.retval != io->xfer_buflen)
|
||||
{
|
||||
fprintf(stderr, "Short write: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
|
||||
fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
@@ -62,9 +62,10 @@ struct http_co_t
|
||||
void run_cb_and_clear();
|
||||
void start_connection();
|
||||
void close_connection();
|
||||
void next_request();
|
||||
void handle_events();
|
||||
void handle_connect_result();
|
||||
void submit_read();
|
||||
void submit_read(bool check_timeout);
|
||||
void submit_send();
|
||||
bool handle_read();
|
||||
void post_message(int type, const std::string & msg);
|
||||
@@ -128,6 +129,7 @@ void http_co_t::run_cb_and_clear()
|
||||
// Call callback after clearing it because otherwise we may hit reenterability problems
|
||||
if (cb != NULL)
|
||||
cb(&parsed);
|
||||
next_request();
|
||||
}
|
||||
|
||||
void http_co_t::send_request(const std::string & host, const std::string & request,
|
||||
@@ -161,17 +163,6 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
|
||||
this->sent = 0;
|
||||
this->response_callback = response_callback;
|
||||
this->parsed = {};
|
||||
if (request_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
|
||||
{
|
||||
stackin();
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
});
|
||||
}
|
||||
if (state == HTTP_CO_KEEPALIVE)
|
||||
{
|
||||
state = HTTP_CO_SENDING_REQUEST;
|
||||
@@ -181,6 +172,28 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
|
||||
{
|
||||
start_connection();
|
||||
}
|
||||
// Do it _after_ state assignment because set_timer() can actually trigger
|
||||
// other timers and requests (reenterability is our friend)
|
||||
if (request_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
|
||||
{
|
||||
stackin();
|
||||
if (state == HTTP_CO_REQUEST_SENT)
|
||||
{
|
||||
// In case of high CPU load, we may not handle etcd responses in time
|
||||
// For this case, first check the socket and only then terminate request with the timeout
|
||||
submit_read(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
}
|
||||
stackout();
|
||||
});
|
||||
}
|
||||
stackout();
|
||||
}
|
||||
|
||||
@@ -271,17 +284,19 @@ void http_co_t::close_connection()
|
||||
void http_co_t::start_connection()
|
||||
{
|
||||
stackin();
|
||||
struct sockaddr addr;
|
||||
struct sockaddr_storage addr;
|
||||
if (!string_to_addr(host.c_str(), 1, 80, &addr))
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = "Invalid address: "+host };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
return;
|
||||
}
|
||||
peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (peer_fd < 0)
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = std::string("socket: ")+strerror(errno) };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
@@ -323,7 +338,7 @@ void http_co_t::handle_events()
|
||||
epoll_events &= ~EPOLLOUT;
|
||||
if (epoll_events & EPOLLIN)
|
||||
{
|
||||
submit_read();
|
||||
submit_read(false);
|
||||
}
|
||||
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
|
||||
{
|
||||
@@ -410,10 +425,11 @@ again:
|
||||
stackout();
|
||||
}
|
||||
|
||||
void http_co_t::submit_read()
|
||||
void http_co_t::submit_read(bool check_timeout)
|
||||
{
|
||||
stackin();
|
||||
int res;
|
||||
again:
|
||||
if (rbuf.size() != READ_BUFFER_SIZE)
|
||||
{
|
||||
rbuf.resize(READ_BUFFER_SIZE);
|
||||
@@ -428,7 +444,22 @@ void http_co_t::submit_read()
|
||||
}
|
||||
if (res == -EAGAIN || res == -EINTR)
|
||||
{
|
||||
epoll_events = epoll_events & ~EPOLLIN;
|
||||
if (check_timeout)
|
||||
{
|
||||
if (res == -EINTR)
|
||||
goto again;
|
||||
else
|
||||
{
|
||||
// Timeout happened and there is no data to read
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
epoll_events = epoll_events & ~EPOLLIN;
|
||||
}
|
||||
}
|
||||
else if (res <= 0)
|
||||
{
|
||||
@@ -501,8 +532,11 @@ bool http_co_t::handle_read()
|
||||
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
|
||||
{
|
||||
std::swap(parsed.body, response);
|
||||
response_callback(&parsed);
|
||||
parsed.eof = true;
|
||||
if (!keepalive)
|
||||
close_connection();
|
||||
else
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
run_cb_and_clear();
|
||||
}
|
||||
else if (state == HTTP_CO_CHUNKED && response.size() > 0)
|
||||
{
|
||||
@@ -533,10 +567,14 @@ bool http_co_t::handle_read()
|
||||
response_callback(&parsed);
|
||||
parsed.body = "";
|
||||
}
|
||||
if (parsed.eof && !want_streaming)
|
||||
else if (parsed.eof)
|
||||
{
|
||||
// Normal response
|
||||
response_callback(&parsed);
|
||||
if (!keepalive)
|
||||
close_connection();
|
||||
else
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
run_cb_and_clear();
|
||||
}
|
||||
}
|
||||
else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
|
||||
@@ -547,29 +585,20 @@ bool http_co_t::handle_read()
|
||||
parsed.body = "";
|
||||
}
|
||||
}
|
||||
if (parsed.eof)
|
||||
{
|
||||
response_callback = NULL;
|
||||
parsed = {};
|
||||
if (!keepalive)
|
||||
{
|
||||
close_connection();
|
||||
}
|
||||
else
|
||||
{
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
if (keepalive_queue.size() > 0)
|
||||
{
|
||||
auto next = keepalive_queue[0];
|
||||
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
|
||||
next();
|
||||
}
|
||||
}
|
||||
}
|
||||
stackout();
|
||||
return true;
|
||||
}
|
||||
|
||||
void http_co_t::next_request()
|
||||
{
|
||||
if (keepalive_queue.size() > 0)
|
||||
{
|
||||
auto next = keepalive_queue[0];
|
||||
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t stoull_full(const std::string & str, int base)
|
||||
{
|
||||
if (isspace(str[0]))
|
||||
|
@@ -222,13 +222,13 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
|
||||
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
|
||||
{
|
||||
assert(peer_osd != this->osd_num);
|
||||
struct sockaddr addr;
|
||||
struct sockaddr_storage addr;
|
||||
if (!string_to_addr(peer_host, 0, peer_port, &addr))
|
||||
{
|
||||
on_connect_peer(peer_osd, -EINVAL);
|
||||
return;
|
||||
}
|
||||
int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
int peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (peer_fd < 0)
|
||||
{
|
||||
on_connect_peer(peer_osd, -errno);
|
||||
@@ -484,10 +484,10 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
void osd_messenger_t::accept_connections(int listen_fd)
|
||||
{
|
||||
// Accept new connections
|
||||
sockaddr addr;
|
||||
sockaddr_storage addr;
|
||||
socklen_t peer_addr_size = sizeof(addr);
|
||||
int peer_fd;
|
||||
while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0)
|
||||
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
|
||||
|
@@ -49,7 +49,7 @@ struct osd_client_t
|
||||
{
|
||||
int refs = 0;
|
||||
|
||||
sockaddr peer_addr;
|
||||
sockaddr_storage peer_addr;
|
||||
int peer_port;
|
||||
int peer_fd;
|
||||
int peer_state;
|
||||
|
@@ -111,6 +111,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
else
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
if (cl->osd_num)
|
||||
|
@@ -55,6 +55,15 @@ protected:
|
||||
iovec read_iov = { 0 };
|
||||
|
||||
public:
|
||||
// Releases the receive buffer
~nbd_proxy()
{
    // free() accepts a null pointer, so no separate check is required
    free(recv_buf);
    recv_buf = NULL;
}
|
||||
|
||||
static json11::Json::object parse_args(int narg, const char *args[])
|
||||
{
|
||||
json11::Json::object cfg;
|
||||
@@ -322,6 +331,9 @@ public:
|
||||
delete cli;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
cli = NULL;
|
||||
epmgr = NULL;
|
||||
ringloop = NULL;
|
||||
}
|
||||
|
||||
void load_module()
|
||||
@@ -351,7 +363,8 @@ public:
|
||||
setsid();
|
||||
if (fork())
|
||||
exit(0);
|
||||
chdir("/");
|
||||
if (chdir("/") != 0)
|
||||
fprintf(stderr, "Warning: Failed to chdir into /\n");
|
||||
close(0);
|
||||
close(1);
|
||||
close(2);
|
||||
@@ -498,7 +511,7 @@ protected:
|
||||
goto end_unmap;
|
||||
}
|
||||
ioctl(nbd, NBD_SET_FLAGS, flags);
|
||||
if (timeout >= 0)
|
||||
if (timeout > 0)
|
||||
{
|
||||
r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)timeout);
|
||||
if (r < 0)
|
||||
@@ -513,7 +526,11 @@ protected:
|
||||
{
|
||||
goto end_unmap;
|
||||
}
|
||||
write(qd_fd, "32768", 5);
|
||||
r = write(qd_fd, "32768", 5);
|
||||
if (r != 5)
|
||||
{
|
||||
fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
|
||||
}
|
||||
close(qd_fd);
|
||||
if (!fork())
|
||||
{
|
||||
|
56
src/osd.cpp
56
src/osd.cpp
@@ -3,6 +3,7 @@
|
||||
|
||||
#include <sys/socket.h>
|
||||
#include <sys/poll.h>
|
||||
#include <sys/mman.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <arpa/inet.h>
|
||||
@@ -53,6 +54,20 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
autosync_writes = max_autosync;
|
||||
}
|
||||
|
||||
if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
|
||||
{
|
||||
// Lock all OSD memory if requested
|
||||
if (mlockall(MCL_CURRENT|MCL_FUTURE
|
||||
#ifdef MCL_ONFAULT
|
||||
| MCL_ONFAULT
|
||||
#endif
|
||||
) != 0)
|
||||
{
|
||||
fprintf(stderr, "osd_memlock is set to true, but mlockall() failed: %s\n", strerror(errno));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
print_stats();
|
||||
@@ -185,46 +200,7 @@ void osd_t::bind_socket()
|
||||
|
||||
// FIXME Support multiple listening sockets
|
||||
|
||||
sockaddr addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
if (bind_port == 0)
|
||||
{
|
||||
socklen_t len = sizeof(addr);
|
||||
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
|
||||
}
|
||||
listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
listening_port = bind_port;
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
|
||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||
|
@@ -211,7 +211,7 @@ class osd_t
|
||||
// flushing, recovery and backfill
|
||||
void submit_pg_flush_ops(pg_t & pg);
|
||||
void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
|
||||
void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
bool pick_next_recovery(osd_recovery_op_t &op);
|
||||
void submit_recovery_op(osd_recovery_op_t *op);
|
||||
bool continue_recovery();
|
||||
|
@@ -457,7 +457,8 @@ void osd_t::renew_lease()
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
{
|
||||
// Die
|
||||
throw std::runtime_error("etcd lease has expired");
|
||||
fprintf(stderr, "Error refreshing etcd lease\n");
|
||||
force_stop(1);
|
||||
}
|
||||
if (err != "")
|
||||
{
|
||||
@@ -466,7 +467,8 @@ void osd_t::renew_lease()
|
||||
if (etcd_failed_attempts > st_cli.max_etcd_attempts)
|
||||
{
|
||||
// Die
|
||||
throw std::runtime_error("Cluster connection failed");
|
||||
fprintf(stderr, "Cluster connection failed\n");
|
||||
force_stop(1);
|
||||
}
|
||||
// Retry
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
|
||||
|
@@ -47,7 +47,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
||||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (auto & l: fb->stable_lists)
|
||||
@@ -55,7 +56,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
||||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -160,7 +162,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
// Copy buffer so it gets freed along with the operation
|
||||
@@ -188,10 +190,8 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
int peer_fd = msgr.osd_peer_fds[peer_osd];
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
||||
op->peer_fd = peer_fd;
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_stab = {
|
||||
.header = {
|
||||
@@ -207,8 +207,21 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
||||
delete op;
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
op->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
op->reply.hdr.retval = -EPIPE;
|
||||
op->callback(op);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
|
@@ -29,8 +29,10 @@ void osd_t::handle_peers()
|
||||
degraded_objects += p.second.degraded_objects.size();
|
||||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||
else if (p.second.state & PG_HAS_DEGRADED)
|
||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -340,7 +342,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
@@ -394,7 +396,9 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
{
|
||||
if (op->bs_op->retval < 0)
|
||||
{
|
||||
throw std::runtime_error("local OP_LIST failed");
|
||||
printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
|
||||
force_stop(1);
|
||||
return;
|
||||
}
|
||||
add_bs_subop_stats(op);
|
||||
printf(
|
||||
@@ -419,7 +423,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
// Peer
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds[role_osd];
|
||||
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
@@ -246,7 +246,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
// Send to a remote OSD
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
@@ -287,7 +286,18 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
}
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(subop);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
subop_idx++;
|
||||
}
|
||||
prev = i+1;
|
||||
|
@@ -182,7 +182,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
@@ -225,7 +224,18 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(subop);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
@@ -463,7 +473,6 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -477,7 +486,18 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(&subops[i]);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -567,7 +587,6 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -581,7 +600,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(&subops[i]);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include "osd_id.h"
|
||||
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 512
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
struct buf_len_t
|
||||
|
@@ -134,14 +134,14 @@ int main(int narg, char *args[])
|
||||
|
||||
int connect_osd(const char *osd_address, int osd_port)
|
||||
{
|
||||
struct sockaddr addr;
|
||||
struct sockaddr_storage addr;
|
||||
if (!string_to_addr(osd_address, 0, osd_port, &addr))
|
||||
{
|
||||
fprintf(stderr, "server address: %s is not valid\n", osd_address);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
|
@@ -112,3 +112,17 @@ void ring_loop_t::restore(unsigned sqe_tail)
|
||||
}
|
||||
ring.sq.sqe_tail = sqe_tail;
|
||||
}
|
||||
|
||||
int ring_loop_t::sqes_left()
|
||||
{
|
||||
struct io_uring_sq *sq = &ring.sq;
|
||||
unsigned int head = io_uring_smp_load_acquire(sq->khead);
|
||||
unsigned int next = sq->sqe_tail + 1;
|
||||
int left = *sq->kring_entries - (next - head);
|
||||
if (left > free_ring_data_ptr)
|
||||
{
|
||||
// return min(sqes left, ring_datas left)
|
||||
return free_ring_data_ptr;
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
@@ -17,15 +17,12 @@
|
||||
|
||||
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
|
||||
{
|
||||
sqe->opcode = op;
|
||||
sqe->flags = 0;
|
||||
sqe->ioprio = 0;
|
||||
sqe->fd = fd;
|
||||
sqe->off = offset;
|
||||
sqe->addr = (unsigned long) addr;
|
||||
sqe->len = len;
|
||||
sqe->rw_flags = 0;
|
||||
sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
|
||||
// Prepare a read/write operation without clearing user_data
|
||||
// Very recently, 22 Dec 2021, liburing finally got this change too (8ecd3fd959634df81d66af8b3a69c16202a014e8)
|
||||
// But all versions prior to it (sadly) clear user_data
|
||||
__u64 user_data = sqe->user_data;
|
||||
io_uring_prep_rw(op, sqe, fd, addr, len, offset);
|
||||
sqe->user_data = user_data;
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
|
||||
@@ -172,6 +169,7 @@ public:
|
||||
struct io_uring_cqe *cqe;
|
||||
return io_uring_wait_cqe(&ring, &cqe);
|
||||
}
|
||||
int sqes_left();
|
||||
inline unsigned space_left()
|
||||
{
|
||||
return free_ring_data_ptr;
|
||||
|
@@ -67,14 +67,14 @@ int main(int narg, char *args[])
|
||||
|
||||
int connect_stub(const char *server_address, int server_port)
|
||||
{
|
||||
struct sockaddr addr;
|
||||
struct sockaddr_storage addr;
|
||||
if (!string_to_addr(server_address, 0, server_port, &addr))
|
||||
{
|
||||
fprintf(stderr, "server address: %s is not valid\n", server_address);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
|
@@ -41,21 +41,19 @@
|
||||
#include "rw_blocking.h"
|
||||
#include "osd_ops.h"
|
||||
|
||||
int bind_stub(std::string bind_address, int bind_port);
|
||||
|
||||
void run_stub(int peer_fd);
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
int listen_fd = bind_stub("0.0.0.0", 11203);
|
||||
int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
|
||||
// Accept new connections
|
||||
sockaddr addr;
|
||||
sockaddr_storage addr;
|
||||
socklen_t peer_addr_size = sizeof(addr);
|
||||
int peer_fd;
|
||||
while (1)
|
||||
{
|
||||
printf("stub_osd: waiting for 1 client\n");
|
||||
peer_fd = accept(listen_fd, &addr, &peer_addr_size);
|
||||
peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
|
||||
if (peer_fd == -1)
|
||||
{
|
||||
if (errno == EAGAIN)
|
||||
@@ -76,39 +74,6 @@ int main(int narg, char *args[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bind_stub(std::string bind_address, int bind_port)
|
||||
{
|
||||
int listen_backlog = 128;
|
||||
|
||||
sockaddr addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
int listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
return listen_fd;
|
||||
}
|
||||
|
||||
void run_stub(int peer_fd)
|
||||
{
|
||||
osd_any_op_t op;
|
||||
|
@@ -25,8 +25,6 @@
|
||||
#include "epoll_manager.h"
|
||||
#include "messenger.h"
|
||||
|
||||
int bind_stub(std::string bind_address, int bind_port);
|
||||
|
||||
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
|
||||
|
||||
int main(int narg, char *args[])
|
||||
@@ -43,7 +41,8 @@ int main(int narg, char *args[])
|
||||
json11::Json config = json11::Json::object { { "log_level", 1 } };
|
||||
msgr->parse_config(config);
|
||||
// Accept new connections
|
||||
int listen_fd = bind_stub("0.0.0.0", 11203);
|
||||
int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->set_fd_handler(listen_fd, false, [listen_fd, msgr](int fd, int events)
|
||||
{
|
||||
msgr->accept_connections(listen_fd);
|
||||
@@ -67,41 +66,6 @@ int main(int narg, char *args[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bind_stub(std::string bind_address, int bind_port)
|
||||
{
|
||||
int listen_backlog = 128;
|
||||
|
||||
sockaddr addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
int listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
|
||||
return listen_fd;
|
||||
}
|
||||
|
||||
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
|
||||
{
|
||||
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.6.12
|
||||
Version: 0.6.15
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
Reference in New Issue
Block a user