Compare commits

41 Commits

| SHA1 |
| --- |
| 85298ddae2 |
| e23296a327 |
| 839ec9e6e0 |
| 7cbfdff41a |
| 951272f27f |
| a3fb1d4c98 |
| 88402e6eb6 |
| 390239c51b |
| b7b2adfa32 |
| 36c276358b |
| 117d6f0612 |
| 7d79c58095 |
| 46d2bc100f |
| 732e2804e9 |
| abaec2008c |
| 8129d238a4 |
| 61ebed144a |
| 9d3ba113aa |
| 9788045dc9 |
| d6b0d29af6 |
| 36f352f06f |
| 318cc463c2 |
| 145e5cfb86 |
| 73ae578981 |
| 20ee4ed758 |
| 63de79d1b2 |
| f712967079 |
| df0cd85352 |
| ebaf4d7a72 |
| d4bc10542c |
| 140309620a |
| 0a610ee943 |
| f3ce166064 |
| 717d303370 |
| d9857a5340 |
| eb5d9153e8 |
| ae6d1ed1d5 |
| d123e58ea3 |
| d9869d8116 |
| 4047ca606f |
| 218e294e9c |
```diff
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
 
 project(vitastor)
 
-set(VERSION "0.6.12")
+set(VERSION "0.6.15")
 
 add_subdirectory(src)
```
```diff
@@ -55,11 +55,11 @@ Vitastor на данный момент находится в статусе п
 
 ## Планы развития
 
-- Поддержка удаления снапшотов (слияния слоёв)
 - Более корректные скрипты разметки дисков и автоматического запуска OSD
 - Другие инструменты администрирования
 - Плагины для OpenNebula и других облачных систем
 - iSCSI-прокси
+- Упрощённый NFS прокси
 - Более быстрое переключение при отказах
 - Фоновая проверка целостности без контрольных сумм (сверка реплик)
 - Контрольные суммы
```
```diff
@@ -49,11 +49,11 @@ breaking changes in the future. However, the following is implemented:
 
 ## Roadmap
 
-- Snapshot deletion (layer merge) support
 - Better OSD creation and auto-start tools
 - Other administrative tools
-- Plugins for OpenNebula, Proxmox and other cloud systems
+- Plugins for OpenNebula and other cloud systems
 - iSCSI proxy
+- Simplified NFS proxy
 - Faster failover
 - Scrubbing without checksums (verification of replicas)
 - Checksums
```
```diff
@@ -1,4 +1,4 @@
-VERSION ?= v0.6.12
+VERSION ?= v0.6.15
 
 all: build push
 
```
```diff
@@ -49,7 +49,7 @@ spec:
 capabilities:
 add: ["SYS_ADMIN"]
 allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v0.6.12
+image: vitalif/vitastor-csi:v0.6.15
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
```
```diff
@@ -116,7 +116,7 @@ spec:
 privileged: true
 capabilities:
 add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v0.6.12
+image: vitalif/vitastor-csi:v0.6.15
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
```
```diff
@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.6.12"
+    vitastorCSIDriverVersion = "0.6.15"
 )
 
 // Config struct fills the parameters of request or user input
```
debian/changelog (vendored, 2 changed lines):

```diff
@@ -1,4 +1,4 @@
-vitastor (0.6.12-1) unstable; urgency=medium
+vitastor (0.6.15-1) unstable; urgency=medium
 
   * RDMA support
   * Bugfixes
```
debian/vitastor.Dockerfile (vendored, 8 changed lines):

```diff
@@ -33,8 +33,8 @@ RUN set -e -x; \
 mkdir -p /root/packages/vitastor-$REL; \
 rm -rf /root/packages/vitastor-$REL/*; \
 cd /root/packages/vitastor-$REL; \
-cp -r /root/vitastor vitastor-0.6.12; \
-cd vitastor-0.6.12; \
+cp -r /root/vitastor vitastor-0.6.15; \
+cd vitastor-0.6.15; \
 ln -s /root/fio-build/fio-*/ ./fio; \
 FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +47,8 @@ RUN set -e -x; \
 rm -rf a b; \
 echo "dep:fio=$FIO" > debian/fio_version; \
 cd /root/packages/vitastor-$REL; \
-tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.12.orig.tar.xz vitastor-0.6.12; \
-cd vitastor-0.6.12; \
+tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.15.orig.tar.xz vitastor-0.6.15; \
+cd vitastor-0.6.15; \
 V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
 DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
```
docs/params/common.yml (new file, 35 lines):

```yaml
- name: config_path
  type: string
  default: "/etc/vitastor/vitastor.conf"
  info: |
    Path to the JSON configuration file. Configuration file is optional,
    a non-existing configuration file does not prevent Vitastor from
    running if required parameters are specified.
  info_ru: |
    Путь к файлу конфигурации в формате JSON. Файл конфигурации необязателен,
    без него Vitastor тоже будет работать, если переданы необходимые параметры.
- name: etcd_address
  type: string or array of strings
  type_ru: строка или массив строк
  info: |
    etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
    specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
    Note that https is not supported for etcd connections yet.
  info_ru: |
    Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
    или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
- name: etcd_prefix
  type: string
  default: "/vitastor"
  info: |
    Prefix for all keys in etcd used by Vitastor. You can change prefix and, for
    example, use a single etcd cluster for multiple Vitastor clusters.
  info_ru: |
    Префикс для ключей etcd, которые использует Vitastor. Вы можете задать другой
    префикс, например, чтобы запустить несколько кластеров Vitastor с одним
    кластером etcd.
- name: log_level
  type: int
  default: 0
  info: Log level. Raise if you want more verbose output.
  info_ru: Уровень логгирования. Повысьте, если хотите более подробный вывод.
```
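Taken together, these common parameters map onto a config file like the following sketch, written to the default `config_path`. This is a hypothetical minimal example; the endpoint addresses simply reuse the ones shown in the `etcd_address` description and `log_level` 1 is illustrative:

```sh
# Hypothetical minimal /etc/vitastor/vitastor.conf combining the common
# parameters documented above (addresses are placeholders):
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
    "etcd_address": ["10.0.115.10:2379/v3", "10.0.115.11:2379/v3"],
    "etcd_prefix": "/vitastor",
    "log_level": 1
}
EOF
```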
docs/params/layout-cluster.yml (new file, 200 lines):

```yaml
- name: block_size
  type: int
  default: 131072
  info: |
    Size of objects (data blocks) into which all physical and virtual drives are
    subdivided in Vitastor. One of current main settings in Vitastor, affects
    memory usage, write amplification and I/O load distribution effectiveness.

    Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
    it's possible to use 4 MB for SSD too - it will lower memory usage, but
    may increase average WA and reduce linear performance.

    OSDs with different block sizes (for example, SSD and SSD+HDD OSDs) can
    currently coexist in one etcd instance only within separate Vitastor
    clusters with different etcd_prefix'es.

    Also block size can't be changed after OSD initialization without losing
    data.

    You must always specify block_size in etcd in /vitastor/config/global if
    you change it so all clients can know about it.

    OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
    544 MB per 1 TB of used disk space with the default 128 KB block size.
  info_ru: |
    Размер объектов (блоков данных), на которые делятся физические и виртуальные
    диски в Vitastor. Одна из ключевых на данный момент настроек, влияет на
    потребление памяти, объём избыточной записи (write amplification) и
    эффективность распределения нагрузки по OSD.

    Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
    мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
    это понизит использование памяти, но ухудшит распределение нагрузки и в
    среднем увеличит WA.

    OSD с разными размерами блока (например, SSD и SSD+HDD OSD) на данный
    момент могут сосуществовать в рамках одного etcd только в виде двух независимых
    кластеров Vitastor с разными etcd_prefix.

    Также размер блока нельзя менять после инициализации OSD без потери данных.

    Если вы меняете размер блока, обязательно прописывайте его в etcd в
    /vitastor/config/global, дабы все клиенты его знали.

    Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
    т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
    стандартном 128 КБ блоке.
- name: bitmap_granularity
  type: int
  default: 4096
  info: |
    Required virtual disk write alignment ("sector size"). Must be a multiple
    of disk_alignment. It's called bitmap granularity because Vitastor tracks
    an allocation bitmap for each object containing 2 bits per each
    (bitmap_granularity) bytes.

    This parameter can't be changed after OSD initialization without losing
    data. Also it's fixed for the whole Vitastor cluster i.e. two different
    values can't be used in a single Vitastor cluster.

    Clients MUST be aware of this parameter value, so put it into etcd key
    /vitastor/config/global if you change it for any reason.
  info_ru: |
    Требуемое выравнивание записи на виртуальные диски (размер их "сектора").
    Должен быть кратен disk_alignment. Называется гранулярностью битовой карты
    потому, что Vitastor хранит битовую карту для каждого объекта, содержащую
    по 2 бита на каждые (bitmap_granularity) байт.

    Данный параметр нельзя менять после инициализации OSD без потери данных.
    Также он фиксирован для всего кластера Vitastor, т.е. разные значения
    не могут сосуществовать в одном кластере.

    Клиенты ДОЛЖНЫ знать правильное значение этого параметра, так что если вы
    его меняете, обязательно прописывайте изменённое значение в etcd в ключ
    /vitastor/config/global.
- name: immediate_commit
  type: string
  default: false
  info: |
    Another parameter which is really important for performance.

    Desktop SSDs are very fast (100000+ iops) for simple random writes
    without cache flush. However, they are really slow (only around 1000 iops)
    if you try to fsync() each write, that is, when you want to guarantee that
    each change gets immediately persisted to the physical media.

    Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
    "Supercapacitor-based Power Loss Protection", on the other hand, are equally
    fast with and without fsync because their cache is protected from sudden
    power loss by a built-in supercapacitor-based "UPS".

    Some software-defined storage systems always fsync each write and thus are
    really slow when used with desktop SSDs. Vitastor, however, can also
    efficiently utilize desktop SSDs by postponing fsync until the client calls
    it explicitly.

    This is what this parameter regulates. When it's set to "all" the whole
    Vitastor cluster commits each change to disks immediately and clients just
    ignore fsyncs because they know for sure that they're unneeded. This reduces
    the amount of network roundtrips performed by clients and improves
    performance. So it's always better to use server grade SSDs with
    supercapacitors even with Vitastor, especially given that they cost only
    a bit more than desktop models.

    There is also a common SATA SSD (and HDD too!) firmware bug (or feature)
    that makes server SSDs which have supercapacitors slow with fsync. To check
    if your SSDs are affected, compare benchmark results from `fio -name=test
    -ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1` with and without
    `-fsync=1`. Results should be the same. If fsync=1 result is worse you can
    try to work around this bug by "disabling" drive write-back cache by running
    `hdparm -W 0 /dev/sdXX` or `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
    (IMPORTANT: don't mistake it with `/sys/block/sdXX/queue/write_cache` - it's
    unsafe to change by hand). The same may apply to newer HDDs with internal
    SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
    it (they have internal SSD cache even though it's not stated in datasheets).

    This parameter must be set both in etcd in /vitastor/config/global and in
    OSD command line or configuration. Setting it to "all" or "small" requires
    enabling disable_journal_fsync and disable_meta_fsync, setting it to "all"
    also requires enabling disable_data_fsync.

    TLDR: For optimal performance, set immediate_commit to "all" if you only use
    SSDs with supercapacitor-based power loss protection (nonvolatile
    write-through cache) for both data and journals in the whole Vitastor
    cluster. Set it to "small" if you only use such SSDs for journals. Leave
    empty if your drives have write-back cache.
  info_ru: |
    Ещё один важный для производительности параметр.

    Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
    секунду) при простой случайной записи без сбросов кэша. Однако они очень
    медленные (всего порядка 1000 iops), если вы пытаетесь сбрасывать кэш после
    каждой записи, то есть, если вы пытаетесь гарантировать, что каждое
    изменение физически записывается в энергонезависимую память.

    С другой стороны, серверные SSD с конденсаторами - функцией, называемой
    "Advanced/Enhanced Power Loss Protection" или просто "Supercapacitor-based
    Power Loss Protection" - одинаково быстрые и со сбросом кэша, и без
    него, потому что их кэш защищён от потери питания встроенным "источником
    бесперебойного питания" на основе суперконденсаторов и на самом деле они
    его никогда не сбрасывают.

    Некоторые программные СХД всегда сбрасывают кэши дисков при каждой записи
    и поэтому работают очень медленно с настольными SSD. Vitastor, однако, может
    откладывать fsync до явного его вызова со стороны клиента и таким образом
    эффективно утилизировать настольные SSD.

    Данный параметр влияет как раз на это. Когда он установлен в значение "all",
    весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
    носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
    знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
    по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
    использовать только серверные модели SSD с суперконденсаторами, особенно
    учитывая то, что стоят они ненамного дороже настольных.

    Также в прошивках SATA SSD (и даже HDD!) очень часто встречается либо баг,
    либо просто особенность логики, из-за которой серверные SSD, имеющие
    конденсаторы и защиту от потери питания, всё равно медленно работают с
    fsync. Чтобы понять, подвержены ли этой проблеме ваши SSD, сравните
    результаты тестов `fio -name=test -ioengine=libaio -direct=1 -bs=4k
    -rw=randwrite -iodepth=1` без и с опцией `-fsync=1`. Результаты должны
    быть одинаковые. Если результат с `fsync=1` хуже, вы можете попробовать
    обойти проблему, "отключив" кэш записи диска командой `hdparm -W 0 /dev/sdXX`
    либо `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
    (ВАЖНО: не перепутайте с `/sys/block/sdXX/queue/write_cache` - этот параметр
    менять руками небезопасно). Такая же проблема может встречаться и в новых
    HDD-дисках с внутренним SSD или "медиа" кэшем - например, она встречается во
    многих дисках Seagate EXOS (у них есть внутренний SSD-кэш, хотя это и не
    указано в спецификациях).

    Данный параметр нужно указывать и в etcd в /vitastor/config/global, и в
    командной строке или конфигурации OSD. Значения "all" и "small" требуют
    включения disable_journal_fsync и disable_meta_fsync, значение "all" также
    требует включения disable_data_fsync.

    Итого, вкратце: для оптимальной производительности установите
    immediate_commit в значение "all", если вы используете в кластере только SSD
    с суперконденсаторами и для данных, и для журналов. Если вы используете
    такие SSD для всех журналов, но не для данных - можете установить параметр
    в "small". Если же какие-то из дисков журналов имеют волатильный кэш записи -
    оставьте параметр пустым.
- name: client_dirty_limit
  type: int
  default: 33554432
  info: |
    Without immediate_commit=all this parameter sets the limit of "dirty"
    (not committed by fsync) data allowed by the client before forcing an
    additional fsync and committing the data. Also note that the client always
    holds a copy of uncommitted data in memory so this setting also affects
    RAM usage of clients.

    This parameter doesn't affect OSDs themselves.
  info_ru: |
    При работе без immediate_commit=all - это лимит объёма "грязных" (не
    зафиксированных fsync-ом) данных, при достижении которого клиент будет
    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
    что в этом случае до момента fsync клиент хранит копию незафиксированных
    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.

    Параметр не влияет на сами OSD.
```
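A quick illustration of the points above: the memory estimate from `block_size` checks out with shell arithmetic, the cluster-wide values must be mirrored into `/vitastor/config/global`, and the `fio` comparison from `immediate_commit` runs as shown below. A hedged sketch only: the etcd endpoint, the device name `/dev/sdX` and the added `-runtime`/`-filename` fio options are assumptions, not part of this changeset.

```sh
# "SIZE / BLOCK * 68 bytes": 1 TB of data at the default 128 KB block size
echo $(( 1024*1024*1024*1024 / (128*1024) * 68 / 1024 / 1024 ))  # ~544 (MB)

# Cluster-wide layout parameters mirrored into etcd, as the docs require
# (note: `put` replaces the whole key value):
ETCDCTL_API=3 etcdctl --endpoints=http://10.0.115.10:2379 put /vitastor/config/global \
    '{"block_size":131072,"bitmap_granularity":4096,"immediate_commit":"all"}'

# fio comparison from the immediate_commit notes; both runs should give
# similar iops on server SSDs with power loss protection:
fio -name=test -ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1 \
    -runtime=30 -filename=/dev/sdX
fio -name=test -ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1 \
    -runtime=30 -filename=/dev/sdX -fsync=1
# If the -fsync=1 result is much worse, "disable" the volatile write cache:
hdparm -W 0 /dev/sdX
```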
docs/params/layout-osd.yml (new file, 205 lines):

```yaml
- name: data_device
  type: string
  info: |
    Path to the block device to use for data. It's highly recommended to use
    stable paths for all device names: `/dev/disk/by-partuuid/xxx...` instead
    of just `/dev/sda` or `/dev/nvme0n1` to not mess up after server restart.
    Files can also be used instead of block devices, but this is implemented
    only for testing purposes and not for production.
  info_ru: |
    Путь к диску (блочному устройству) для хранения данных. Крайне рекомендуется
    использовать стабильные пути: `/dev/disk/by-partuuid/xxx...` вместо простых
    `/dev/sda` или `/dev/nvme0n1`, чтобы пути не могли спутаться после
    перезагрузки сервера. Также вместо блочных устройств можно указывать файлы,
    но это реализовано только для тестирования, а не для боевой среды.
- name: meta_device
  type: string
  info: |
    Path to the block device to use for the metadata. Metadata must be on a fast
    SSD or performance will suffer. If this option is skipped, `data_device` is
    used for the metadata.
  info_ru: |
    Путь к диску метаданных. Метаданные должны располагаться на быстром
    SSD-диске, иначе производительность пострадает. Если эта опция не указана,
    для метаданных используется `data_device`.
- name: journal_device
  type: string
  info: |
    Path to the block device to use for the journal. Journal must be on a fast
    SSD or performance will suffer. If this option is skipped, `meta_device` is
    used for the journal, and if it's also empty, journal is put on
    `data_device`. It's almost always fine to put metadata and journal on the
    same device, in this case you only need to set `meta_device`.
  info_ru: |
    Путь к диску журнала. Журнал должен располагаться на быстром SSD-диске,
    иначе производительность пострадает. Если эта опция не указана,
    для журнала используется `meta_device`, если же пуста и она, журнал
    располагается на `data_device`. Нормально располагать журнал и метаданные
    на одном устройстве, в этом случае достаточно указать только `meta_device`.
- name: journal_offset
  type: int
  default: 0
  info: Offset on the device in bytes where the journal is stored.
  info_ru: Смещение на устройстве в байтах, по которому располагается журнал.
- name: journal_size
  type: int
  info: |
    Journal size in bytes. Doesn't have to be large, 16-32 MB is usually fine.
    By default, the whole journal device will be used for the journal. You must
    set it to some value manually (or use make-osd.sh) if you colocate the
    journal with data or metadata.
  info_ru: |
    Размер журнала в байтах. Большим быть не обязан, 16-32 МБ обычно достаточно.
    По умолчанию для журнала используется всё устройство журнала. Если же вы
    размещаете журнал на устройстве данных или метаданных, то вы должны
    установить эту опцию в какое-то значение сами (или использовать скрипт
    make-osd.sh).
- name: meta_offset
  type: int
  default: 0
  info: |
    Offset on the device in bytes where the metadata area is stored.
    Again, set it to something if you colocate metadata with journal or data.
  info_ru: |
    Смещение на устройстве в байтах, по которому располагаются метаданные.
    Эту опцию нужно задать, если метаданные у вас хранятся на том же
    устройстве, что данные или журнал.
- name: data_offset
  type: int
  default: 0
  info: |
    Offset on the device in bytes where the data area is stored.
    Again, set it to something if you colocate data with journal or metadata.
  info_ru: |
    Смещение на устройстве в байтах, по которому располагаются данные.
    Эту опцию нужно задать, если данные у вас хранятся на том же
    устройстве, что метаданные или журнал.
- name: data_size
  type: int
  info: |
    Data area size in bytes. By default, the whole data device up to the end
    will be used for the data area, but you can restrict it if you want to use
    a smaller part. Note that there is no option to set metadata area size -
    it's derived from the data area size.
  info_ru: |
    Размер области данных в байтах. По умолчанию под данные будет использована
    вся доступная область устройства данных до конца устройства, но вы можете
    использовать эту опцию, чтобы ограничить её меньшим размером. Заметьте, что
    опции размера области метаданных нет - она вычисляется из размера области
    данных автоматически.
- name: meta_block_size
  type: int
  default: 4096
  info: |
    Physical block size of the metadata device. 4096 for most current
    HDDs and SSDs.
  info_ru: |
    Размер физического блока устройства метаданных. 4096 для большинства
    современных SSD и HDD.
- name: journal_block_size
  type: int
  default: 4096
  info: |
    Physical block size of the journal device. Must be a multiple of
    `disk_alignment`. 4096 for most current HDDs and SSDs.
  info_ru: |
    Размер физического блока устройства журнала. Должен быть кратен
    `disk_alignment`. 4096 для большинства современных SSD и HDD.
- name: disable_data_fsync
  type: bool
  default: false
  info: |
    Do not issue fsyncs to the data device, i.e. do not flush its cache.
    Safe ONLY if your data device has write-through cache. If you disable
    the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
    that the cache disable command is run every time before starting Vitastor
    OSD, for example, in the systemd unit. See also `immediate_commit` option
    for the instructions to disable cache and how to benefit from it.
  info_ru: |
    Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
    Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
    записью (write-through). Если вы отключаете кэш через `hdparm` или
    `scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
    выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
    Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
    и о том, как из этого извлечь выгоду.
- name: disable_meta_fsync
  type: bool
  default: false
  info: |
    Same as disable_data_fsync, but for the metadata device. If the metadata
    device is not set or if the data device is used for the metadata the option
    is ignored and disable_data_fsync value is used instead of it.
  info_ru: |
    То же, что disable_data_fsync, но для устройства метаданных. Если устройство
    метаданных не задано или если оно равно устройству данных, значение опции
    игнорируется и вместо него используется значение опции disable_data_fsync.
- name: disable_journal_fsync
  type: bool
  default: false
  info: |
    Same as disable_data_fsync, but for the journal device. If the journal
    device is not set or if the metadata device is used for the journal the
    option is ignored and disable_meta_fsync value is used instead of it. If
    the same device is used for data, metadata and journal the option is also
    ignored and disable_data_fsync value is used instead of it.
  info_ru: |
    То же, что disable_data_fsync, но для устройства журнала. Если устройство
    журнала не задано или если оно равно устройству метаданных, значение опции
    игнорируется и вместо него используется значение опции disable_meta_fsync.
    Если одно и то же устройство используется и под данные, и под журнал, и под
    метаданные - значение опции также игнорируется и вместо него используется
    значение опции disable_data_fsync.
- name: disable_device_lock
  type: bool
  default: false
  info: |
    Do not lock data, metadata and journal block devices exclusively with
    flock(). Though it's not recommended, you can use it if you want to run
    multiple OSDs with a single device and different offsets, without using
    partitions.
  info_ru: |
    Не блокировать устройства данных, метаданных и журнала от открытия их
    другими OSD с помощью flock(). Так делать не рекомендуется, но теоретически
    вы можете это использовать, чтобы запускать несколько OSD на одном
    устройстве с разными смещениями и без использования разделов.
- name: disk_alignment
  type: int
  default: 4096
  info: |
    Required physical disk write alignment. Most current SSD and HDD drives
    use 4 KB physical sectors even if they report 512 byte logical sector
    size, so 4 KB is a good default setting.

    Note, however, that physical sector size also affects WA, because with block
    devices it's impossible to write anything smaller than a block. So, when
    Vitastor has to write a single metadata entry that's only about 32 bytes in
    size, it actually has to write the whole 4 KB sector.

    Because of this it can actually be beneficial to use SSDs which work well
    with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
    and meta_block_size. But the only SSD that may fit into this category is
    Intel Optane (probably, not tested yet).

    Clients don't need to be aware of disk_alignment, so it's not required to
    put a modified value into etcd key /vitastor/config/global.
  info_ru: |
    Требуемое выравнивание записи на физические диски. Почти все современные
    SSD и HDD диски используют 4 КБ физические секторы, даже если показывают
    логический размер сектора 512 байт, поэтому 4 КБ - хорошее значение по
    умолчанию.

    Однако стоит понимать, что физический размер сектора тоже влияет на
    избыточную запись (WA), потому что ничего меньше блока (сектора) на блочное
    устройство записать невозможно. Таким образом, когда Vitastor-у нужно
    записать на диск всего лишь одну 32-байтную запись метаданных, фактически
    приходится перезаписывать 4 КБ сектор целиком.

    Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
    меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
    journal_block_size и meta_block_size. Однако единственные SSD, которые
    теоретически могут попасть в эту категорию - это Intel Optane (но и это
    пока не проверялось автором).

    Клиентам не обязательно знать про disk_alignment, так что помещать значение
    этого параметра в etcd в /vitastor/config/global не нужно.
```
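The device options above combine roughly as in the following sketch: data on a large drive, metadata and journal colocated on a fast SSD partition, so only `meta_device` and `journal_size` need to be set besides `data_device`. The binary name `vitastor-osd` and the `--osd_num` flag are assumptions here, and the partuuid paths are placeholders; make-osd.sh normally computes such a layout for you:

```sh
# Hypothetical OSD invocation using the layout parameters documented above;
# journal goes to meta_device by default, so journal_size must be set:
vitastor-osd \
    --osd_num 1 \
    --data_device /dev/disk/by-partuuid/xxx... \
    --meta_device /dev/disk/by-partuuid/yyy... \
    --journal_size $((32*1024*1024))
```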
docs/params/monitor.yml (new file, 65 lines):

```yaml
- name: etcd_mon_ttl
  type: sec
  min: 10
  default: 30
  info: Monitor etcd lease refresh interval in seconds
  info_ru: Интервал обновления etcd резервации (lease) монитором
- name: etcd_mon_timeout
  type: ms
  default: 1000
  info: etcd request timeout used by monitor
  info_ru: Таймаут выполнения запросов к etcd от монитора
- name: etcd_mon_retries
  type: int
  default: 5
  info: Maximum number of attempts for one monitor etcd request
  info_ru: Максимальное число попыток выполнения запросов к etcd монитором
- name: mon_change_timeout
  type: ms
  min: 100
  default: 1000
  info: Optimistic retry interval for monitor etcd modification requests
  info_ru: Время повтора при коллизиях при запросах модификации в etcd, производимых монитором
- name: mon_stats_timeout
  type: ms
  min: 100
  default: 1000
  info: |
    Interval for monitor to wait before updating aggregated statistics in
    etcd after receiving OSD statistics updates
  info_ru: |
    Интервал, который монитор ожидает при изменении статистики по отдельным
    OSD перед обновлением агрегированной статистики в etcd
- name: osd_out_time
  type: sec
  default: 600
  info: |
    Time after which a failed OSD is removed from the data distribution.
    I.e. time which the monitor waits before attempting to restore data
    redundancy using other OSDs.
  info_ru: |
    Время, через которое отключенный OSD исключается из распределения данных.
    То есть, время, которое монитор ожидает перед попыткой переместить данные
    на другие OSD и таким образом восстановить избыточность хранения.
- name: placement_levels
  type: json
  default: '`{"host":100,"osd":101}`'
  info: |
    Levels for the placement tree. You can define arbitrary tree levels by
    defining them in this parameter. The configuration parameter value should
    contain a JSON object with level names as keys and integer priorities as
    values. Smaller priority means higher level in tree. For example,
    "datacenter" should have smaller priority than "osd". "host" and "osd"
    levels are always predefined and can't be removed. If one of them is not
    present in the configuration, then it is defined with the default priority
    (100 for "host", 101 for "osd").
  info_ru: |
    Определения уровней для дерева размещения OSD. Вы можете определять
    произвольные уровни, помещая их в данный параметр конфигурации. Значение
    параметра должно содержать JSON-объект, ключи которого будут являться
    названиями уровней, а значения - целочисленными приоритетами. Меньшие
    приоритеты соответствуют верхним уровням дерева. Например, уровень
    "датацентр" должен иметь меньший приоритет, чем "OSD". Уровни с названиями
    "host" и "osd" являются предопределёнными и не могут быть удалены. Если
    один из них отсутствует в конфигурации, он доопределяется с приоритетом по
    умолчанию (100 для уровня "host", 101 для "osd").
```
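For instance, to add "dc" and "rack" levels above "host", `placement_levels` could be set like this. A sketch under the assumption that the monitor picks this key up from the global etcd configuration like the other cluster-wide parameters; the endpoint and level priorities are illustrative:

```sh
# Smaller priority = higher in the placement tree; "host" and "osd" stay
# predefined even if omitted. Note: `put` replaces the whole key value,
# so merge with any existing /vitastor/config/global contents first.
ETCDCTL_API=3 etcdctl --endpoints=http://10.0.115.10:2379 put /vitastor/config/global \
    '{"placement_levels":{"dc":90,"rack":95,"host":100,"osd":101}}'
```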
docs/params/network.yml (new file, 225 lines):

```yaml
- name: tcp_header_buffer_size
  type: int
  default: 65536
  info: |
    Size of the buffer used to read data using an additional copy. Vitastor
    packet headers are 128 bytes, payload is always at least 4 KB, so it is
    usually beneficial to try to read multiple packets at once even though
    it requires to copy the data an additional time. The rest of each packet
    is received without an additional copy. You can try to play with this
    parameter and see how it affects random iops and linear bandwidth if you
    want.
  info_ru: |
    Размер буфера для чтения данных с дополнительным копированием. Пакеты
    Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
    от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
    сразу несколько пакетов, даже не смотря на то, что это требует лишний раз
    скопировать данные. Часть каждого пакета за пределами значения данного
    параметра читается без дополнительного копирования. Вы можете попробовать
    поменять этот параметр и посмотреть, как он влияет на производительность
    случайного и линейного доступа.
- name: use_sync_send_recv
  type: bool
  default: false
  info: |
    If true, synchronous send/recv syscalls are used instead of io_uring for
    socket communication. Useless for OSDs because they require io_uring anyway,
    but may be required for clients with old kernel versions.
  info_ru: |
    Если установлено в истину, то вместо io_uring для передачи данных по сети
    будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
    это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
    принципе, это может применяться для клиентов со старыми версиями ядра.
- name: use_rdma
  type: bool
  default: true
  info: |
    Try to use RDMA for communication if it's available. Disable if you don't
    want Vitastor to use RDMA. RDMA increases the performance, but TCP-only
    clients can still talk to an RDMA-enabled cluster, so you don't need to
    make sure that all clients support RDMA when enabling it.
  info_ru: |
    Пытаться использовать RDMA для связи при наличии доступных устройств.
    Отключите, если вы не хотите, чтобы Vitastor использовал RDMA.
    RDMA улучшает производительность, но TCP-клиенты по-прежнему могут
    обращаться к RDMA-кластеру, так что при включении RDMA не обязательно,
    чтобы все клиенты его поддерживали.
- name: rdma_device
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
    to work. For example, Mellanox ConnectX-3 and older adapters don't have
    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
    root to list available RDMA devices and their features.
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
    параметры и возможности.
- name: rdma_port_num
  type: int
  default: 1
  info: |
    RDMA device port number to use. Only for devices that have more than 1 port.
    See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
    your device has.
  info_ru: |
    Номер порта RDMA-устройства, который следует использовать. Имеет смысл
    только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
    у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
    `ibv_devinfo -v`.
- name: rdma_gid_index
  type: int
  default: 0
  info: |
    Global address identifier index of the RDMA device to use. Different GID
    indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP.
    Search for "GID" in `ibv_devinfo -v` output to determine which GID index
    you need.

    **IMPORTANT:** If you want to use RoCEv2 (as recommended) then the correct
    rdma_gid_index is usually 1 (IPv6) or 3 (IPv4).
  info_ru: |
    Номер глобального идентификатора адреса RDMA-устройства, который следует
    использовать. Разным gid_index могут соответствовать разные протоколы связи:
    RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со
    словом "GID" в выводе команды `ibv_devinfo -v`.

    **ВАЖНО:** Если вы хотите использовать RoCEv2 (как мы и рекомендуем), то
    правильный rdma_gid_index, как правило, 1 (IPv6) или 3 (IPv4).
- name: rdma_mtu
  type: int
  default: 4096
  info: |
    RDMA Path MTU to use. Must be 1024, 2048 or 4096. There is usually no
    sense to change it from the default 4096.
  info_ru: |
    Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
    2048 или 4096. Обычно нет смысла менять значение по умолчанию, равное 4096.
- name: rdma_max_sge
  type: int
  default: 128
  info: |
    Maximum number of scatter/gather entries to use for RDMA. OSDs negotiate
    the actual value when establishing connection anyway, so it's usually not
    required to change this parameter.
  info_ru: |
    Максимальное число записей разделения/сборки (scatter/gather) для RDMA.
    OSD в любом случае согласовывают реальное значение при установке соединения,
    так что менять этот параметр обычно не нужно.
- name: rdma_max_msg
  type: int
  default: 1048576
  info: Maximum size of a single RDMA send or receive operation in bytes.
  info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
- name: rdma_max_recv
  type: int
  default: 8
  info: |
    Maximum number of parallel RDMA receive operations. Note that this number
    of receive buffers `rdma_max_msg` in size are allocated for each client,
    so this setting actually affects memory usage. This is because RDMA receive
    operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
    later versions.
  info_ru: |
    Максимальное число параллельных RDMA-операций получения данных. Следует
    иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
    для каждого подключённого клиентского соединения, так что данная настройка
    влияет на потребление памяти. Это так потому, что RDMA-приём данных в
    Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
    копирует данные в памяти. Данная особенность, возможно, будет исправлена в
    более новых версиях Vitastor.
- name: peer_connect_interval
  type: sec
  min: 1
  default: 5
  info: Interval before attempting to reconnect to an unavailable OSD.
  info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
- name: peer_connect_timeout
  type: sec
  min: 1
  default: 5
  info: Timeout for OSD connection attempts.
  info_ru: Максимальное время ожидания попытки соединения с OSD.
- name: osd_idle_timeout
  type: sec
  min: 1
  default: 5
  info: |
    OSD connection inactivity time after which clients and other OSDs send
    keepalive requests to check state of the connection.
  info_ru: |
    Время неактивности соединения с OSD, после которого клиенты или другие OSD
    посылают запрос проверки состояния соединения.
- name: osd_ping_timeout
  type: sec
  min: 1
  default: 5
  info: |
    Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
    within this time, the connection to it is dropped and a reconnection attempt
    is scheduled.
  info_ru: |
    Максимальное время ожидания ответа на запрос проверки состояния соединения.
    Если OSD не отвечает за это время, соединение отключается и производится
    повторная попытка соединения.
- name: up_wait_retry_interval
  type: ms
  min: 50
  default: 500
  info: |
    OSDs respond to clients with a special error code when they receive I/O
    requests for a PG that's not synchronized and started. This parameter sets
    the time for the clients to wait before re-attempting such I/O requests.
  info_ru: |
    Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
    поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
    они отвечают клиентам специальным кодом ошибки, означающим, что клиент
    должен некоторое время подождать перед повторением запроса. Именно это время
    ожидания задаёт данный параметр.
- name: max_etcd_attempts
  type: int
  default: 5
  info: |
    Maximum number of attempts for etcd requests which can't be retried
    indefinitely.
  info_ru: |
    Максимальное число попыток выполнения запросов к etcd для тех запросов,
    которые нельзя повторять бесконечно.
- name: etcd_quick_timeout
  type: ms
  default: 1000
  info: |
    Timeout for etcd requests which should complete quickly, like lease refresh.
  info_ru: |
    Максимальное время выполнения запросов к etcd, которые должны завершаться
    быстро, таких, как обновление резервации (lease).
- name: etcd_slow_timeout
  type: ms
  default: 5000
  info: Timeout for etcd requests which are allowed to wait for some time.
  info_ru: |
    Максимальное время выполнения запросов к etcd, для которых не обязательно
    гарантировать быстрое выполнение.
- name: etcd_keepalive_timeout
  type: sec
  default: max(30, etcd_report_interval*2)
  info: |
    Timeout for etcd connection HTTP Keep-Alive. Should be higher than
    etcd_report_interval to guarantee that keepalive actually works.
  info_ru: |
    Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
    etcd_report_interval, чтобы keepalive гарантированно работал.
- name: etcd_ws_keepalive_timeout
  type: sec
  default: 30
  info: |
    etcd websocket ping interval required to keep the connection alive and
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
```
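The RDMA parameters above can be checked and wired together roughly as follows. The grep pattern and the conf snippet are illustrative; the device name "rocep5s0f0" and the RoCEv2 GID index 3 are taken from the parameter descriptions, not from a real system:

```sh
# List RDMA devices, their port counts and GIDs, as suggested above:
ibv_devinfo -v | grep -E 'hca_id|phys_port_cnt|GID\['

# Hypothetical client/OSD configuration enabling RDMA over RoCEv2 (IPv4):
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
    "use_rdma": true,
    "rdma_device": "rocep5s0f0",
    "rdma_port_num": 1,
    "rdma_gid_index": 3
}
EOF
```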
341
docs/params/osd.yml
Normal file
341
docs/params/osd.yml
Normal file
@@ -0,0 +1,341 @@
|
|||||||
|
- name: etcd_report_interval
|
||||||
|
type: sec
|
||||||
|
default: 5
|
||||||
|
info: |
|
||||||
|
Interval at which OSDs report their state to etcd. Affects OSD lease time
|
||||||
|
and thus the failover speed. Lease time is equal to this parameter value
|
||||||
|
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
|
||||||
|
that every OSD always refreshes its lease in time.
|
||||||
|
info_ru: |
|
||||||
|
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
|
||||||
|
влияет на время резервации (lease) OSD и поэтому на скорость переключения
|
||||||
|
при падении OSD. Время lease равняется значению этого параметра плюс
|
||||||
|
max_etcd_attempts * etcd_quick_timeout.
|
||||||
|
- name: run_primary
|
||||||
|
type: bool
|
||||||
|
default: true
|
||||||
|
info: |
|
||||||
|
Start primary OSD logic on this OSD. As of now, can be turned off only for
|
||||||
|
debugging purposes. It's possible to implement additional feature for the
|
||||||
|
monitor which may allow to separate primary and secondary OSDs, but it's
|
||||||
|
unclear why anyone could need it, so it's not implemented.
|
||||||
|
info_ru: |
|
||||||
|
Запускать логику первичного OSD на данном OSD. На данный момент отключать
|
||||||
|
эту опцию может иметь смысл только в целях отладки. В теории, можно
|
||||||
|
реализовать дополнительный режим для монитора, который позволит отделять
|
||||||
|
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
||||||
|
понадобиться, поэтому это не реализовано.
|
||||||
|
- name: osd_network
|
||||||
|
type: string or array of strings
|
||||||
|
type_ru: строка или массив строк
|
||||||
|
info: |
|
||||||
|
Network mask of the network (IPv4 or IPv6) to use for OSDs. Note that
|
||||||
|
although it's possible to specify multiple networks here, this does not
|
||||||
|
mean that OSDs will create multiple listening sockets - they'll only
|
||||||
|
pick the first matching address of an UP + RUNNING interface. Separate
|
||||||
|
networks for cluster and client connections are also not implemented, but
|
||||||
|
they are mostly useless anyway, so it's not a big deal.
|
||||||
|
info_ru: |
|
||||||
|
Маска подсети (IPv4 или IPv6) для использования для соединений с OSD.
|
||||||
|
Имейте в виду, что хотя сейчас и можно передать в этот параметр несколько
|
||||||
|
подсетей, это не означает, что OSD будут создавать несколько слушающих
|
||||||
|
сокетов - они лишь будут выбирать адрес первого поднятого (состояние UP +
|
||||||
|
RUNNING), подходящий под заданную маску. Также не реализовано разделение
|
||||||
|
кластерной и публичной сетей OSD. Правда, от него обычно всё равно довольно
|
||||||
|
мало толку, так что особенной проблемы в этом нет.
|
||||||
|
- name: bind_address
|
||||||
|
type: string
|
||||||
|
default: "0.0.0.0"
|
||||||
|
info: |
|
||||||
|
Instead of the network mask, you can also set OSD listen address explicitly
|
||||||
|
using this parameter. May be useful if you want to start OSDs on interfaces
|
||||||
|
that are not UP + RUNNING.
|
||||||
|
info_ru: |
|
||||||
|
Этим параметром можно явным образом задать адрес, на котором будет ожидать
|
||||||
|
соединений OSD (вместо использования маски подсети). Может быть полезно,
|
||||||
|
например, чтобы запускать OSD на неподнятых интерфейсах (не UP + RUNNING).
|
||||||
|
- name: bind_port
|
||||||
|
type: int
|
||||||
|
info: |
|
||||||
|
By default, OSDs pick random ports to use for incoming connections
|
||||||
|
automatically. With this option you can set a specific port for a specific
|
||||||
|
OSD by hand.
|
||||||
|
info_ru: |
|
||||||
|
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
||||||
|
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
||||||
|
- name: autosync_interval
|
||||||
|
type: sec
|
||||||
|
default: 5
|
||||||
|
info: |
|
||||||
|
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||||
|
the immediate_commit mode if disabled. fsyncs are required because without
|
||||||
|
them OSDs quickly fill their journals, become unable to clear them and
|
||||||
|
stall. Also this option limits the amount of recent uncommitted changes
|
||||||
|
which OSDs may lose in case of a power outage in case when clients don't
|
||||||
|
issue fsyncs at all.
|
||||||
|
info_ru: |
|
||||||
|
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
|
||||||
|
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
|
||||||
|
OSD, чтобы успевать очищать журнал - без них OSD быстро заполняют журналы и
|
||||||
|
перестают обрабатывать операции записи. Также эта опция ограничивает объём
|
||||||
|
недавних незафиксированных изменений, которые OSD могут терять при
|
||||||
|
отключении питания, если клиенты вообще не отправляют fsync.
|
||||||
|
- name: autosync_writes
|
||||||
|
type: int
|
||||||
|
default: 128
|
||||||
|
info: |
|
||||||
|
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||||
|
    operations before issuing an fsync operation internally.
  info_ru: |
    Аналогично autosync_interval, но задаёт не временной интервал, а
    максимальное количество незафиксированных операций записи перед
    принудительной отправкой fsync-а.
- name: recovery_queue_depth
  type: int
  default: 4
  info: |
    Maximum recovery operations per one primary OSD at any given moment of time.
    Currently it's the only parameter available to tune the speed of recovery
    and rebalancing, but it's planned to implement more.
  info_ru: |
    Максимальное число операций восстановления на одном первичном OSD в любой
    момент времени. На данный момент единственный параметр, который можно менять
    для ускорения или замедления восстановления и перебалансировки данных, но
    в планах реализация других параметров.
- name: recovery_sync_batch
  type: int
  default: 16
  info: Maximum number of recovery operations before issuing an additional fsync.
  info_ru: Максимальное число операций восстановления перед дополнительным fsync.
- name: readonly
  type: bool
  default: false
  info: |
    Read-only mode. If this is enabled, an OSD will never issue any writes to
    the underlying device. This may be useful for recovery purposes.
  info_ru: |
    Режим "только чтение". Если включить этот режим, OSD не будет писать ничего
    на диск. Может быть полезно в целях восстановления.
- name: no_recovery
  type: bool
  default: false
  info: |
    Disable automatic background recovery of objects. Note that it doesn't
    affect implicit recovery of objects happening during writes - a write is
    always made to a full set of at least pg_minsize OSDs.
  info_ru: |
    Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
    что эта опция не отключает восстановление объектов, происходящее при
    записи - запись всегда производится в полный набор из как минимум pg_minsize
    OSD.
- name: no_rebalance
  type: bool
  default: false
  info: |
    Disable background movement of data between different OSDs. Disabling it
    means that PGs in the `has_misplaced` state will be left in it indefinitely.
  info_ru: |
    Отключить фоновое перемещение объектов между разными OSD. Отключение
    означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
    в нём на неопределённый срок.
- name: print_stats_interval
  type: sec
  default: 3
  info: |
    Time interval at which OSDs print simple human-readable operation
    statistics on stdout.
  info_ru: |
    Временной интервал, с которым OSD печатают простую человекочитаемую
    статистику выполнения операций в стандартный вывод.
- name: slow_log_interval
  type: sec
  default: 10
  info: |
    Time interval at which OSDs dump slow or stuck operations on stdout, if
    there are any. It is also the time after which an operation is considered
    "slow".
  info_ru: |
    Временной интервал, с которым OSD выводят в стандартный вывод список
    медленных или зависших операций, если таковые имеются. Также время, при
    превышении которого операция считается "медленной".
- name: max_write_iodepth
  type: int
  default: 128
  info: |
    Parallel client write operation limit per one OSD. Operations that exceed
    this limit are pushed to a temporary queue instead of being executed
    immediately.
  info_ru: |
    Максимальное число одновременных клиентских операций записи на один OSD.
    Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
    временной очереди.
- name: min_flusher_count
  type: int
  default: 1
  info: |
    Flusher is a micro-thread that moves data from the journal to the data
    area of the device. Their number is auto-tuned between minimum and maximum.
    The minimum number is set by this parameter.
  info_ru: |
    Flusher - это микро-поток (корутина), которая копирует данные из журнала в
    основную область устройства данных. Их число настраивается динамически между
    минимальным и максимальным значением. Этот параметр задаёт минимальное число.
- name: max_flusher_count
  type: int
  default: 256
  info: |
    Maximum number of journal flushers (see above min_flusher_count).
  info_ru: |
    Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
- name: inmemory_metadata
  type: bool
  default: true
  info: |
    This parameter makes Vitastor always keep the metadata area of the block
    device in memory. It's required for good performance because it avoids
    additional read-modify-write cycles during metadata modifications. The
    metadata area size is currently roughly 224 MB per 1 TB of data. You can
    turn it off to reduce memory usage by this value, but it will hurt
    performance. This restriction is likely to be removed in the future along
    with the upgrade of the metadata storage scheme.
  info_ru: |
    Данный параметр заставляет Vitastor всегда держать область метаданных диска
    в памяти. Это нужно, чтобы избегать дополнительных операций чтения с диска
    при записи. Размер области метаданных на данный момент составляет примерно
    224 МБ на 1 ТБ данных. При отключении параметра потребление памяти снизится
    примерно на эту величину, но при этом также снизится и производительность.
    В будущем, после обновления схемы хранения метаданных, это ограничение,
    скорее всего, будет ликвидировано.
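The 224 MB per 1 TB estimate above is easy to sanity-check: with the default 128 KB block size, 1 TB of data is 8,388,608 blocks, and about 28 bytes of metadata per block (object id, version and allocation bitmap; the exact entry layout is an assumption here) gives almost exactly 224 MB. A minimal check in C++:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t data_size = 1ull << 40;     // 1 TB of data
    const uint64_t block_size = 128*1024;      // default 128 KB blocks
    const uint64_t entry_size = 28;            // assumed bytes of metadata per block
    uint64_t entries = data_size / block_size; // 8388608 blocks
    printf("metadata: %llu MB\n", (unsigned long long)((entries * entry_size) >> 20)); // 224 MB
    return 0;
}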
- name: inmemory_journal
  type: bool
  default: true
  info: |
    This parameter makes Vitastor always keep the journal area of the block
    device in memory. Turning it off will, again, reduce memory usage, but
    hurt performance because flusher coroutines will have to read data from
    the disk back before copying it into the main area. The memory usage benefit
    is typically very small because it's sufficient to have a 16-32 MB journal
    for SSD OSDs. However, in theory it's possible that you'll want to turn it
    off for hybrid (HDD+SSD) OSDs with large journals on quick devices.
  info_ru: |
    Данный параметр заставляет Vitastor всегда держать в памяти журналы OSD.
    Отключение параметра, опять же, снижает потребление памяти, но ухудшает
    производительность, так как для копирования данных из журнала в основную
    область устройства OSD будут вынуждены читать их обратно с диска. Выигрыш
    по памяти при этом обычно крайне низкий, так как для SSD OSD обычно
    достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
    параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
    журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: journal_sector_buffer_count
  type: int
  default: 32
  info: |
    Maximum number of buffers that can be used for writing journal metadata
    blocks. The only situation when you should increase it to a larger value
    is when you enable journal_no_same_sector_overwrites. In this case set
    it to, for example, 1024.
  info_ru: |
    Максимальное число буферов, разрешённых для использования под записываемые
    в журнал блоки метаданных. Единственная ситуация, в которой этот параметр
    нужно менять - это если вы включаете journal_no_same_sector_overwrites. В
    этом случае установите данный параметр, например, в 1024.
- name: journal_no_same_sector_overwrites
  type: bool
  default: false
  info: |
    Enable this option for SSDs like Intel D3-S4510 and D3-S4610 which REALLY
    don't like when a program overwrites the same sector multiple times in a
    row and slow down significantly (from 25000+ iops to ~3000 iops). When
    this option is set, Vitastor will always move to the next sector of the
    journal after writing it instead of possibly overwriting it the second time.
  info_ru: |
    Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
    ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
    подряд. Такие SSD при многократной перезаписи одного и того же сектора
    сильно замедляются - условно, с 25000 и более iops до 3000 iops. Когда
    данная опция установлена, Vitastor всегда переходит к следующему сектору
    журнала после записи вместо потенциально повторной перезаписи того же
    самого сектора.
- name: throttle_small_writes
  type: bool
  default: false
  info: |
    Enable soft throttling of small journaled writes. Useful for hybrid OSDs
    with fast journal/metadata devices and slow data devices. The idea is that
    small writes complete very quickly because they're first written to the
    journal device, but moving them to the main device is slow. So if an OSD
    allows clients to issue a lot of small writes, it will perform very well
    for several seconds, and then the journal will fill up and performance
    will drop to almost zero. Throttling is meant to prevent this problem by
    artificially slowing quick writes down based on the amount of free space
    in the journal. When throttling is used, the performance of small writes
    decreases smoothly instead of dropping abruptly at the moment when the
    journal fills up.
  info_ru: |
    Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
    гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
    данных. Идея заключается в том, что мелкие записи в этой ситуации могут
    завершаться очень быстро, так как они изначально записываются на быстрое
    журнальное устройство (SSD). Но перемещать их потом на основное медленное
    устройство долго. Поэтому если OSD быстро примет от клиентов очень много
    мелких операций записи, он быстро заполнит свой журнал, после чего
    производительность записи резко упадёт практически до нуля. Ограничение
    скорости записи призвано решить эту проблему с помощью искусственного
    замедления операций записи на основании объёма свободного места в журнале.
    Когда эта опция включена, производительность мелких операций записи будет
    снижаться плавно, а не резко в момент окончательного заполнения журнала.
- name: throttle_target_iops
  type: int
  default: 100
  info: |
    Target maximum number of throttled operations per second under the
    condition of a full journal. Set it to the approximate random write iops
    of your data devices (HDDs).
  info_ru: |
    Расчётное максимальное число ограничиваемых операций в секунду при условии
    отсутствия свободного места в журнале. Устанавливайте приблизительно равным
    максимальной производительности случайной записи ваших устройств данных
    (HDD) в операциях в секунду.
- name: throttle_target_mbs
  type: int
  default: 100
  info: |
    Target maximum bandwidth of throttled operations in MB/s under the
    condition of a full journal. Set it to the approximate linear write
    performance of your data devices (HDDs).
  info_ru: |
    Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
    условии отсутствия свободного места в журнале. Устанавливайте приблизительно
    равным максимальной производительности линейной записи ваших устройств
    данных (HDD).
- name: throttle_target_parallelism
  type: int
  default: 1
  info: |
    Target maximum parallelism of throttled operations under the condition of
    a full journal. Set it to the approximate internal parallelism of your data
    devices (1 for HDDs, 4-8 for SSDs).
  info_ru: |
    Расчётный максимальный параллелизм ограничиваемых операций в секунду при
    условии отсутствия свободного места в журнале. Устанавливайте приблизительно
    равным внутреннему параллелизму ваших устройств данных (1 для HDD, 4-8
    для SSD).
- name: throttle_threshold_us
  type: us
  default: 50
  info: |
    Minimal computed delay to be applied to throttled operations. Usually
    doesn't need to be changed.
  info_ru: |
    Минимальная применимая к ограничиваемым операциям задержка. Обычно не
    требует изменений.
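To make the four throttle_* parameters above concrete, here is one plausible way they could combine into a per-operation delay. This is an illustrative sketch only, not the actual blockstore logic; the real implementation may weigh the targets differently:

#include <cstdint>
#include <algorithm>

// Sketch: derive a target execution time for one throttled write from the
// configured targets (all assumed non-zero, as their defaults are).
// 1 MB/s equals 1 byte/us, so op_size/target_mbs is already in microseconds.
uint64_t throttle_delay_us(uint64_t op_size, uint64_t target_iops,
    uint64_t target_mbs, uint64_t parallelism, uint64_t threshold_us)
{
    uint64_t iops_us = 1000000 / target_iops;  // 10000 us per op at 100 iops
    uint64_t bw_us = op_size / target_mbs;     // bandwidth-limited time
    uint64_t target = std::max(iops_us, bw_us) / parallelism;
    return target < threshold_us ? 0 : target; // don't bother below the threshold
}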
- name: osd_memlock
  type: bool
  default: false
  info: >
    Lock all OSD memory to prevent it from being paged out to swap, using
    mlockall(). Requires a sufficient ulimit -l (max locked memory).
  info_ru: >
    Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
    в пространство подкачки. Требует достаточного значения ulimit -l (лимита
    заблокированной памяти).
mon/mon.js
@@ -83,13 +83,13 @@ const etcd_tree = {
         osd_idle_timeout: 5, // seconds. min: 1
         osd_ping_timeout: 5, // seconds. min: 1
         up_wait_retry_interval: 500, // ms. min: 50
-        // osd
-        etcd_report_interval: 5, // seconds
         max_etcd_attempts: 5,
         etcd_quick_timeout: 1000, // ms
         etcd_slow_timeout: 5000, // ms
-        etcd_keepalive_timeout: 30, // seconds, default is min(30, etcd_report_interval*2)
+        etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
         etcd_ws_keepalive_interval: 30, // seconds
+        // osd
+        etcd_report_interval: 5, // seconds
         run_primary: true,
         osd_network: null, // "192.168.7.0/24" or an array of masks
         bind_address: "0.0.0.0",

@@ -104,6 +104,7 @@ const etcd_tree = {
         no_rebalance: false,
         print_stats_interval: 3,
         slow_log_interval: 10,
+        osd_memlock: false,
         // blockstore - fixed in superblock
         block_size,
         disk_alignment,

@@ -130,6 +131,11 @@ const etcd_tree = {
         inmemory_journal,
         journal_sector_buffer_count,
         journal_no_same_sector_overwrites,
+        throttle_small_writes: false,
+        throttle_target_iops: 100,
+        throttle_target_mbs: 100,
+        throttle_target_parallelism: 1,
+        throttle_threshold_us: 50,
     }, */
     global: {},
     /* node_placement: {

@@ -1339,21 +1345,30 @@ class Mon
         const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
         for (const op in op_stats)
         {
-            op_stats[op].bps = prev_stats ? (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm : 0;
-            op_stats[op].iops = prev_stats ? (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm : 0;
-            op_stats[op].lat = prev_stats ? (op_stats[op].usec - prev_stats.op_stats[op].usec)
-                / ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n) : 0;
+            if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
+            {
+                op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
+                op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
+                op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
+                    / ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
+            }
         }
         for (const op in subop_stats)
         {
-            subop_stats[op].iops = prev_stats ? (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm : 0;
-            subop_stats[op].lat = prev_stats ? (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
-                / ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n) : 0;
+            if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
+            {
+                subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
+                subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
+                    / ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
+            }
         }
         for (const op in recovery_stats)
         {
-            recovery_stats[op].bps = prev_stats ? (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm : 0;
-            recovery_stats[op].iops = prev_stats ? (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm : 0;
+            if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
+            {
+                recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
+                recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
+            }
         }
         return { op_stats, subop_stats, recovery_stats };
     }
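The guarded delta computation in the hunk above is the whole fix: rates are only computed when the previous sample actually contains the same op key. Restated outside BigInt for clarity (values are arbitrary):

#include <cstdint>
#include <cstdio>

int main()
{
    // counters from two consecutive samples, 2000 ms apart
    uint64_t bytes_now = 5000000, bytes_prev = 1000000;
    uint64_t cnt_now = 600, cnt_prev = 100;
    uint64_t usec_now = 9000000, usec_prev = 4000000;
    uint64_t tm = 2000;
    uint64_t bps = (bytes_now - bytes_prev) * 1000 / tm;   // 2000000 bytes/s
    uint64_t iops = (cnt_now - cnt_prev) * 1000 / tm;      // 250 ops/s
    uint64_t dc = cnt_now - cnt_prev;
    uint64_t lat = (usec_now - usec_prev) / (dc ? dc : 1); // 10000 us average
    printf("bps=%llu iops=%llu lat=%llu\n", (unsigned long long)bps,
        (unsigned long long)iops, (unsigned long long)lat);
    return 0;
}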
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils
 
-VERSION = '0.6.12'
+VERSION = '0.6.15'
 
 LOG = logging.getLogger(__name__)
 
@@ -25,4 +25,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.6.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.12$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.6.15/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.15$(rpm --eval '%dist').tar.gz *

@@ -34,7 +34,7 @@ ADD . /root/vitastor
 RUN set -e; \
     cd /root/vitastor/rpm; \
     sh build-tarball.sh; \
-    cp /root/vitastor-0.6.12.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.6.15.el7.tar.gz ~/rpmbuild/SOURCES; \
     cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
     cd ~/rpmbuild/SPECS/; \
     rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 0.6.12
+Version: 0.6.15
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-0.6.12.el7.tar.gz
+Source0: vitastor-0.6.15.el7.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -33,7 +33,7 @@ ADD . /root/vitastor
 RUN set -e; \
     cd /root/vitastor/rpm; \
     sh build-tarball.sh; \
-    cp /root/vitastor-0.6.12.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.6.15.el8.tar.gz ~/rpmbuild/SOURCES; \
     cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
     cd ~/rpmbuild/SPECS/; \
     rpmbuild -ba vitastor.spec; \

@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 0.6.12
+Version: 0.6.15
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-0.6.12.el8.tar.gz
+Source0: vitastor-0.6.15.el8.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()
 
-add_definitions(-DVERSION="0.6.12")
+add_definitions(-DVERSION="0.6.15")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
     add_definitions(-fsanitize=address -fno-omit-frame-pointer)

@@ -155,7 +155,7 @@ target_link_libraries(vitastor-nbd
 # vitastor-cli
 add_executable(vitastor-cli
     cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
-    cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
+    cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm_data.cpp cli_rm.cpp
 )
 target_link_libraries(vitastor-cli
     vitastor_client
@@ -1,3 +1,5 @@
+#include <sys/socket.h>
+#include <unistd.h>
 #include <arpa/inet.h>
 #include <net/if.h>
 #include <sys/types.h>

@@ -9,7 +11,7 @@
 
 #include "addr_util.h"
 
-bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
+bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr)
 {
     if (parse_port)
     {

@@ -25,7 +27,7 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
     }
     if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
     {
-        addr->sa_family = AF_INET;
+        addr->ss_family = AF_INET;
         ((struct sockaddr_in*)addr)->sin_port = htons(default_port);
         return true;
     }

@@ -33,30 +35,30 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
     str = str.substr(1, str.length()-2);
     if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
     {
-        addr->sa_family = AF_INET6;
+        addr->ss_family = AF_INET6;
         ((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
         return true;
     }
     return false;
 }
 
-std::string addr_to_string(const sockaddr &addr)
+std::string addr_to_string(const sockaddr_storage &addr)
 {
     char peer_str[256];
     bool ok = false;
     int port;
-    if (addr.sa_family == AF_INET)
+    if (addr.ss_family == AF_INET)
     {
         ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
         port = ntohs(((sockaddr_in*)&addr)->sin_port);
     }
-    else if (addr.sa_family == AF_INET6)
+    else if (addr.ss_family == AF_INET6)
     {
         ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
         port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
     }
     else
-        throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
+        throw std::runtime_error("Unknown address family "+std::to_string(addr.ss_family));
     if (!ok)
         throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
     return std::string(peer_str)+":"+std::to_string(port);

@@ -186,3 +188,51 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
     freeifaddrs(list);
     return addresses;
 }
+
+int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
+{
+    sockaddr_storage addr;
+    if (!string_to_addr(bind_address, 0, bind_port, &addr))
+    {
+        throw std::runtime_error("bind address "+bind_address+" is not valid");
+    }
+
+    int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    if (listen_fd < 0)
+    {
+        throw std::runtime_error(std::string("socket: ") + strerror(errno));
+    }
+    int enable = 1;
+    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
+
+    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("bind: ") + strerror(errno));
+    }
+    if (listening_port)
+    {
+        if (bind_port == 0)
+        {
+            socklen_t len = sizeof(addr);
+            if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
+            {
+                close(listen_fd);
+                throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
+            }
+            *listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
+        }
+        else
+        {
+            *listening_port = bind_port;
+        }
+    }
+
+    if (listen(listen_fd, listen_backlog ? listen_backlog : 128) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("listen: ") + strerror(errno));
+    }
+
+    return listen_fd;
+}
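A hypothetical caller of the new create_and_bind_socket() helper added above, binding an ephemeral port and reading back what the kernel chose (errors surface as the exceptions thrown inside the helper):

#include <stdio.h>
#include "addr_util.h"

int main()
{
    int port = 0;
    // port 0 asks the kernel for any free port; getsockname() inside
    // the helper then fills the real value into `port`
    int fd = create_and_bind_socket("0.0.0.0", 0, 128, &port);
    printf("listening on %d (fd %d)\n", port, fd);
    return 0;
}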
@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>
 
-bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
-std::string addr_to_string(const sockaddr &addr);
+bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
+std::string addr_to_string(const sockaddr_storage &addr);
 std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
+int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
@@ -21,7 +21,7 @@
 // Memory alignment for direct I/O (usually 512 bytes)
 // All other alignments must be a multiple of this one
 #ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT 512
+#define MEM_ALIGNMENT 4096
 #endif
 
 // Default block size is 128 KB, current allowed range is 4K - 128M
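Raising MEM_ALIGNMENT from 512 to 4096 keeps O_DIRECT buffers valid on 4Kn devices as well as 512-byte-sector ones. A typical way such a constant is consumed (an assumed sketch, not code from this repository):

#include <stdlib.h>
#include <new>

#define MEM_ALIGNMENT 4096

// Allocate a buffer usable for O_DIRECT I/O on both 512e and 4Kn disks:
// posix_memalign guarantees the required start-address alignment.
void* alloc_dio_buffer(size_t size)
{
    void *buf = NULL;
    if (posix_memalign(&buf, MEM_ALIGNMENT, size) != 0)
        throw std::bad_alloc();
    return buf;
}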
@@ -415,8 +415,11 @@ stop_flusher:
     flusher->active_flushers++;
 resume_1:
     // Find it in clean_db
-    clean_it = bs->clean_db.find(cur.oid);
-    old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
+    {
+        auto & clean_db = bs->clean_db_shard(cur.oid);
+        auto clean_it = clean_db.find(cur.oid);
+        old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
+    }
     // Scan dirty versions of the object
     if (!scan_dirty(1))
     {

@@ -870,10 +873,11 @@ void journal_flusher_co::update_clean_db()
 #endif
         bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
     }
+    auto & clean_db = bs->clean_db_shard(cur.oid);
     if (has_delete)
     {
-        auto clean_it = bs->clean_db.find(cur.oid);
-        bs->clean_db.erase(clean_it);
+        auto clean_it = clean_db.find(cur.oid);
+        clean_db.erase(clean_it);
 #ifdef BLOCKSTORE_DEBUG
         printf("Free block %lu from %lx:%lx v%lu (delete)\n",
             clean_loc >> bs->block_order,

@@ -884,7 +888,7 @@ void journal_flusher_co::update_clean_db()
     }
     else
     {
-        bs->clean_db[cur.oid] = {
+        clean_db[cur.oid] = {
             .version = cur.version,
             .location = clean_loc,
         };

@@ -49,7 +49,6 @@ class journal_flusher_co
     std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
 
     bool skip_copy, has_delete, has_writes;
-    blockstore_clean_db_t::iterator clean_it;
     std::vector<copy_buffer_t> v;
     std::vector<copy_buffer_t>::iterator it;
     int copy_count;
@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
     // has_writes == 0 - no writes before the current queue item
     // has_writes == 1 - some writes in progress
     // has_writes == 2 - tried to submit some writes, but failed
-    int has_writes = 0, op_idx = 0, new_idx = 0;
+    int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
     for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
     {
         auto op = submit_queue[op_idx];

@@ -142,7 +142,6 @@ void blockstore_impl_t::loop()
                 continue;
             }
         }
-        unsigned ring_space = ringloop->space_left();
         unsigned prev_sqe_pos = ringloop->save();
         // 0 = can't submit
         // 1 = in progress

@@ -199,9 +198,14 @@ void blockstore_impl_t::loop()
         }
         else if (op->opcode == BS_OP_LIST)
         {
-            // LIST doesn't need to be blocked by previous modifications
-            process_list(op);
-            wr_st = 2;
+            // LIST doesn't have to be blocked by previous modifications
+            // But don't do a lot of LISTs at once, because they're blocking and potentially slow
+            if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
+            {
+                process_list(op);
+                done_lists++;
+                wr_st = 2;
+            }
         }
         if (wr_st == 2)
         {

@@ -212,7 +216,6 @@ void blockstore_impl_t::loop()
             ringloop->restore(prev_sqe_pos);
             if (PRIV(op)->wait_for == WAIT_SQE)
             {
-                PRIV(op)->wait_detail = 1 + ring_space;
                 // ring is full, stop submission
                 break;
             }

@@ -282,7 +285,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
 {
     if (PRIV(op)->wait_for == WAIT_SQE)
     {
-        if (ringloop->space_left() < PRIV(op)->wait_detail)
+        if (ringloop->sqes_left() < PRIV(op)->wait_detail)
         {
             // stop submission if there's still no free space
 #ifdef BLOCKSTORE_DEBUG
@@ -425,22 +428,104 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
     return false;
 }
 
+blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
+{
+    uint64_t pg_num = 0;
+    uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
+    auto sh_it = clean_db_settings.find(pool_id);
+    if (sh_it != clean_db_settings.end())
+    {
+        // like map_to_pg()
+        pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
+    }
+    return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
+}
+
+void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
+{
+    uint64_t pool_id = (uint64_t)pool;
+    std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
+    auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
+    while (sh_it != clean_db_shards.end() &&
+        (sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
+    {
+        for (auto & pair: sh_it->second)
+        {
+            // like map_to_pg()
+            uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
+            uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
+            new_shards[shard_id][pair.first] = pair.second;
+        }
+        clean_db_shards.erase(sh_it++);
+    }
+    for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
+    {
+        auto & to = clean_db_shards[sh_it->first];
+        to.swap(sh_it->second);
+    }
+    clean_db_settings[pool_id] = (pool_shard_settings_t){
+        .pg_count = pg_count,
+        .pg_stripe_size = pg_stripe_size,
+    };
+}
+
 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    uint32_t list_pg = op->offset;
+    uint32_t list_pg = op->offset+1;
     uint32_t pg_count = op->len;
     uint64_t pg_stripe_size = op->oid.stripe;
     uint64_t min_inode = op->oid.inode;
     uint64_t max_inode = op->version;
     // Check PG
-    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
+    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
     {
         op->retval = -EINVAL;
         FINISH_OP(op);
         return;
     }
-    // Copy clean_db entries (sorted)
-    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
+    // Check if the DB needs resharding
+    // (we don't know about PGs from the beginning, we only create "shards" here)
+    uint64_t first_shard = 0, last_shard = UINT64_MAX;
+    if (min_inode != 0 &&
+        // Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
+        (min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
+    {
+        pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
+        if (pg_count > 1)
+        {
+            // Per-pg listing
+            auto sh_it = clean_db_settings.find(pool_id);
+            if (sh_it == clean_db_settings.end() ||
+                sh_it->second.pg_count != pg_count ||
+                sh_it->second.pg_stripe_size != pg_stripe_size)
+            {
+                reshard_clean_db(pool_id, pg_count, pg_stripe_size);
+            }
+            first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
+        }
+        else
+        {
+            // Per-pool listing
+            first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
+            last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
+        }
+    }
+    // Copy clean_db entries
+    int stable_count = 0, stable_alloc = 0;
+    if (min_inode != max_inode)
+    {
+        for (auto shard_it = clean_db_shards.lower_bound(first_shard);
+            shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
+            shard_it++)
+        {
+            auto & clean_db = shard_it->second;
+            stable_alloc += clean_db.size();
+        }
+    }
+    else
+    {
+        stable_alloc = 32768;
+    }
     obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
     if (!stable)
     {

@@ -448,7 +533,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
         FINISH_OP(op);
         return;
     }
+    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
+        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
+        shard_it++)
     {
+        auto & clean_db = shard_it->second;
         auto clean_it = clean_db.begin(), clean_end = clean_db.end();
         if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
         {

@@ -463,26 +552,28 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
         }
         for (; clean_it != clean_end; clean_it++)
         {
-            if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
+            if (stable_count >= stable_alloc)
             {
-                if (stable_count >= stable_alloc)
-                {
-                    stable_alloc += 32768;
-                    stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
-                    if (!stable)
-                    {
-                        op->retval = -ENOMEM;
-                        FINISH_OP(op);
-                        return;
-                    }
-                }
-                stable[stable_count++] = {
-                    .oid = clean_it->first,
-                    .version = clean_it->second.version,
-                };
+                stable_alloc *= 2;
+                stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
+                if (!stable)
+                {
+                    op->retval = -ENOMEM;
+                    FINISH_OP(op);
+                    return;
+                }
             }
+            stable[stable_count++] = {
+                .oid = clean_it->first,
+                .version = clean_it->second.version,
+            };
         }
     }
+    if (first_shard != last_shard)
+    {
+        // If that's not a per-PG listing, sort clean entries
+        std::sort(stable, stable+stable_count);
+    }
     int clean_stable_count = stable_count;
     // Copy dirty_db entries (sorted, too)
     int unstable_count = 0, unstable_alloc = 0;
@@ -508,7 +599,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
     }
     for (; dirty_it != dirty_end; dirty_it++)
     {
-        if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
+        if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
         {
             if (IS_DELETE(dirty_it->second.state))
             {
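The shard key built by clean_db_shard() packs the pool id into the top POOL_ID_BITS (16) bits and the 1-based PG number into the low bits, which is why per-pool scans can use lower_bound() on the shard map. A worked example with arbitrary values:

#include <cstdint>
#include <cstdio>

#define POOL_ID_BITS 16

int main()
{
    uint64_t pool_id = 2;
    uint64_t stripe = 5*131072;        // object stripe offset
    uint64_t pg_stripe_size = 131072;  // 128 KB
    uint64_t pg_count = 4;
    // like map_to_pg(), 1-based so that pg_num == 0 can mean "not sharded yet"
    uint64_t pg_num = (stripe / pg_stripe_size) % pg_count + 1;  // (5 % 4) + 1 = 2
    uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
    printf("%016llx\n", (unsigned long long)shard_id);           // 0002000000000002
    return 0;
}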
@@ -55,9 +55,10 @@
 #define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
 
 #define BS_SUBMIT_CHECK_SQES(n) \
-    if (ringloop->space_left() < (n))\
+    if (ringloop->sqes_left() < (n))\
     {\
         /* Pause until there are more requests available */\
+        PRIV(op)->wait_detail = (n);\
         PRIV(op)->wait_for = WAIT_SQE;\
         return 0;\
     }

@@ -71,6 +72,7 @@
     if (!sqe)\
     {\
         /* Pause until there are more requests available */\
+        PRIV(op)->wait_detail = 1;\
         PRIV(op)->wait_for = WAIT_SQE;\
         return 0;\
     }

@@ -80,6 +82,7 @@
     if (!sqe)\
     {\
         /* Pause until there are more requests available */\
+        PRIV(op)->wait_detail = 1;\
         PRIV(op)->wait_for = WAIT_SQE;\
         return 0;\
     }
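The wait_detail assignments added to these macros implement a simple "park and retry" protocol together with the sqes_left() check in check_wait(): an operation that cannot reserve enough submission queue entries records how many it needs and is only retried once that many are free. A self-contained mock of the idea (the names here are ours, not the project's):

#include <cstdio>

#define WAIT_SQE 1
struct mock_op { int wait_for = 0, wait_detail = 0; };
static int sqes_free = 2;

bool try_submit(mock_op *op, int need)
{
    if (sqes_free < need)
    {
        op->wait_for = WAIT_SQE;
        op->wait_detail = need; // what this commit adds: remember the requirement
        return false;
    }
    sqes_free -= need;
    return true;
}

int main()
{
    mock_op op;
    if (!try_submit(&op, 3))
        printf("parked: need %d sqes, have %d\n", op.wait_detail, sqes_free);
    sqes_free = 8; // the ring drained in the meantime
    if (sqes_free >= op.wait_detail && try_submit(&op, op.wait_detail))
        printf("resumed\n");
    return 0;
}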
@@ -201,6 +204,17 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
 
 #include "blockstore_flush.h"
 
+typedef uint32_t pool_id_t;
+typedef uint64_t pool_pg_id_t;
+
+#define POOL_ID_BITS 16
+
+struct pool_shard_settings_t
+{
+    uint32_t pg_count;
+    uint32_t pg_stripe_size;
+};
+
 class blockstore_impl_t
 {
     /******* OPTIONS *******/

@@ -238,11 +252,14 @@ class blockstore_impl_t
     int throttle_target_parallelism = 1;
     // Minimum difference in microseconds between target and real execution times to throttle the response
     int throttle_threshold_us = 50;
+    // Maximum number of LIST operations to be processed in one loop() iteration
+    int single_tick_list_limit = 1;
     /******* END OF OPTIONS *******/
 
     struct ring_consumer_t ring_consumer;
 
-    blockstore_clean_db_t clean_db;
+    std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
+    std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
     uint8_t *clean_bitmap = NULL;
     blockstore_dirty_db_t dirty_db;
     std::vector<blockstore_op_t*> submit_queue;

@@ -291,6 +308,9 @@ class blockstore_impl_t
     void open_journal();
     uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
 
+    blockstore_clean_db_t& clean_db_shard(object_id oid);
+    void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
+
     // Journaling
     void prepare_journal_sector_write(int sector, blockstore_op_t *op);
     void handle_journal_write(ring_data_t *data, uint64_t flush_id);
@@ -222,10 +222,11 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
     }
     if (entry->oid.inode > 0)
     {
-        auto clean_it = bs->clean_db.find(entry->oid);
-        if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
+        auto & clean_db = bs->clean_db_shard(entry->oid);
+        auto clean_it = clean_db.find(entry->oid);
+        if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
         {
-            if (clean_it != bs->clean_db.end())
+            if (clean_it != clean_db.end())
             {
                 // free the previous block
 #ifdef BLOCKSTORE_DEBUG

@@ -245,7 +246,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
             printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
 #endif
             bs->data_alloc->set(done_cnt+i, true);
-            bs->clean_db[entry->oid] = (struct clean_entry){
+            clean_db[entry->oid] = (struct clean_entry){
                 .version = entry->version,
                 .location = (done_cnt+i) << block_order,
             };

@@ -656,8 +657,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
         init_write_sector = proc_pos;
         return 0;
     }
-    auto clean_it = bs->clean_db.find(je->small_write.oid);
-    if (clean_it == bs->clean_db.end() ||
+    auto & clean_db = bs->clean_db_shard(je->small_write.oid);
+    auto clean_it = clean_db.find(je->small_write.oid);
+    if (clean_it == clean_db.end() ||
         clean_it->second.version < je->small_write.version)
     {
         obj_ver_id ov = {

@@ -735,8 +737,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
             erase_dirty_object(dirty_it);
         }
     }
-    auto clean_it = bs->clean_db.find(je->big_write.oid);
-    if (clean_it == bs->clean_db.end() ||
+    auto & clean_db = bs->clean_db_shard(je->big_write.oid);
+    auto clean_it = clean_db.find(je->big_write.oid);
+    if (clean_it == clean_db.end() ||
         clean_it->second.version < je->big_write.version)
     {
         // oid, version, block

@@ -841,8 +844,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
         dirty_it--;
         dirty_exists = dirty_it->first.oid == je->del.oid;
     }
-    auto clean_it = bs->clean_db.find(je->del.oid);
-    bool clean_exists = (clean_it != bs->clean_db.end() &&
+    auto & clean_db = bs->clean_db_shard(je->del.oid);
+    auto clean_it = clean_db.find(je->del.oid);
+    bool clean_exists = (clean_it != clean_db.end() &&
         clean_it->second.version < je->del.version);
     if (!clean_exists && dirty_exists)
     {

@@ -901,8 +905,9 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
             break;
         }
     }
-    auto clean_it = bs->clean_db.find(oid);
-    uint64_t clean_loc = clean_it != bs->clean_db.end()
+    auto & clean_db = bs->clean_db_shard(oid);
+    auto clean_it = clean_db.find(oid);
+    uint64_t clean_loc = clean_it != clean_db.end()
         ? clean_it->second.location : UINT64_MAX;
     if (exists && clean_loc == UINT64_MAX)
     {
@@ -111,6 +111,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
 
 int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
 {
+    auto & clean_db = clean_db_shard(read_op->oid);
     auto clean_it = clean_db.find(read_op->oid);
     auto dirty_it = dirty_db.upper_bound((obj_ver_id){
         .oid = read_op->oid,

@@ -297,6 +298,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
             dirty_it--;
         }
     }
+    auto & clean_db = clean_db_shard(oid);
     auto clean_it = clean_db.find(oid);
     if (clean_it != clean_db.end())
     {
@@ -54,6 +54,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
     auto dirty_it = dirty_db.find(*v);
     if (dirty_it == dirty_db.end())
     {
+        auto & clean_db = clean_db_shard(v->oid);
         auto clean_it = clean_db.find(v->oid);
         if (clean_it == clean_db.end() || clean_it->second.version < v->version)
         {

@@ -188,6 +189,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
     }
     if (exists == -1)
     {
+        auto & clean_db = clean_db_shard(v.oid);
         auto clean_it = clean_db.find(v.oid);
         exists = clean_it != clean_db.end() ? 1 : 0;
     }

@@ -215,6 +217,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
             break;
         }
     }
+    auto & clean_db = clean_db_shard(v.oid);
     auto clean_it = clean_db.find(v.oid);
     uint64_t clean_loc = clean_it != clean_db.end()
         ? clean_it->second.location : UINT64_MAX;

@@ -41,6 +41,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
     }
     if (!found)
     {
+        auto & clean_db = clean_db_shard(op->oid);
         auto clean_it = clean_db.find(op->oid);
         if (clean_it != clean_db.end())
         {
@@ -365,6 +365,13 @@ void cli_tool_t::run(json11::Json cfg)
         if (action_cb != NULL)
             ringloop->wait();
     }
+    // Destroy the client
+    delete cli;
+    delete epmgr;
+    delete ringloop;
+    cli = NULL;
+    epmgr = NULL;
+    ringloop = NULL;
 }
 
 int main(int narg, const char *args[])

@@ -374,5 +381,6 @@ int main(int narg, const char *args[])
     exe_name = args[0];
     cli_tool_t *p = new cli_tool_t();
     p->run(cli_tool_t::parse_args(narg, args));
+    delete p;
     return 0;
 }
@@ -108,9 +108,14 @@ resume_1:
                 pool_avail = pg_free;
             }
         }
+        if (pool_avail == UINT64_MAX)
+        {
+            pool_avail = 0;
+        }
         if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
         {
-            pool_avail = pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
+            uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
+            pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
         }
         pool_stats[pool_cfg.id] = json11::Json::object {
             { "name", pool_cfg.name },

@@ -189,11 +194,16 @@ resume_1:
         json11::Json::array list;
         for (auto & kv: pool_stats)
         {
-            kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
-            kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
+            double raw_to = kv.second["raw_to_usable"].number_value();
+            if (raw_to < 0.000001 && raw_to > -0.000001)
+                raw_to = 1;
+            kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
+            kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
             kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
-            kv.second["used_pct"] = format_q(100 - 100*kv.second["max_available"].uint64_value() *
-                kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())+"%";
+            kv.second["used_pct"] = format_q(kv.second["total_raw"].uint64_value()
+                ? (100 - 100*kv.second["max_available"].uint64_value() *
+                    kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())
+                : 100)+"%";
             kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
         }
         printf("%s", print_table(to_list(), cols, parent->color).c_str());
@@ -154,7 +154,7 @@ resume_1:
         if (pool_it != parent->cli->st_cli.pool_config.end())
         {
             auto & pool_cfg = pool_it->second;
-            used_size = used_size / pool_pg_real_size[pool_id]
+            used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1)
                 * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
         }
         auto stat_it = stats.find(inode_num);
src/cli_rm.cpp
@@ -1,211 +1,566 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.1 (see README.md for details)
 
+#include <fcntl.h>
 #include "cli.h"
 #include "cluster_client.h"
+#include "base64.h"
 
-#define RM_LISTING 1
-#define RM_REMOVING 2
-#define RM_END 3
-
-struct rm_pg_t
-{
-    pg_num_t pg_num;
-    osd_num_t rm_osd_num;
-    std::set<object_id> objects;
-    std::set<object_id>::iterator obj_pos;
-    uint64_t obj_count = 0, obj_done = 0;
-    int state = 0;
-    int in_flight = 0;
-};
-
-struct rm_inode_t
-{
-    uint64_t inode = 0;
-    pool_id_t pool_id = 0;
-    uint64_t min_offset = 0;
-
-    cli_tool_t *parent = NULL;
-    inode_list_t *lister = NULL;
-    std::vector<rm_pg_t*> lists;
-    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
-    uint64_t pgs_to_list = 0;
-    bool lists_done = false;
-    int state = 0;
-
-    void start_delete()
-    {
-        lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
-            std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
+// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
+//
+// Exactly one child of the requested layers may be merged using the "inverted" workflow,
+// where we merge it "down" into one of the "to-be-removed" layers and then rename the
+// "to-be-removed" layer to the child. It may be done either if all writers are stopped
+// before trying to delete layers (which is signaled by --writers-stopped) or if that child
+// is a read-only layer (snapshot) itself.
+//
+// This "inverted" workflow trades copying data of one of the deleted layers for copying
+// data of one child of the chain which is also a child of the "traded" layer. So we
+// choose the (parent,child) pair which has the largest difference between "parent" and
+// "child" inode sizes.
+//
+// All other children of the chain are processed by iterating through them, merging removed
+// parents into them and rebasing them to the last layer which isn't a member of the removed
+// chain.
+//
+// Example:
+//
+// <parent> - <from> - <layer 2> - <to> - <child 1>
+//               \          \        \- <child 2>
+//                \          \- <child 3>
+//                 \-<child 4>
+//
+// 1) Find optimal pair for the "reverse" scenario
+//    Imagine that it's (<layer 2>, <child 1>) in this example
+// 2) Process all children except <child 1>:
+//    - Merge <from>..<to> to <child 2>
+//    - Set <child 2> parent to <parent>
+//    - Repeat for others
+// 3) Process <child 1>:
+//    - Merge <from>..<child 1> to <layer 2>
+//    - Set <layer 2> parent to <parent>
+//    - Rename <layer 2> to <child 1>
+// 4) Delete other layers of the chain (<from>, <to>)
+struct snap_remover_t
+{
+    cli_tool_t *parent;
+
+    // remove from..to
+    std::string from_name, to_name;
+    // writers are stopped, we can safely change writable layers
+    bool writers_stopped = false;
+    // use CAS writes (0 = never, 1 = auto, 2 = always)
+    int use_cas = 1;
+    // interval between fsyncs
+    int fsync_interval = 128;
+
+    std::map<inode_t,int> sources;
+    std::map<inode_t,uint64_t> inode_used;
+    std::vector<inode_t> merge_children;
+    std::vector<inode_t> chain_list;
+    std::map<inode_t,int> inverse_candidates;
+    inode_t inverse_parent = 0, inverse_child = 0;
+    inode_t new_parent = 0;
+    int state = 0;
+    int current_child = 0;
+    std::function<bool(void)> cb;
+
+    bool is_done()
+    {
+        return state == 9;
+    }
+
+    void loop()
+    {
+        if (state == 1)
+            goto resume_1;
+        else if (state == 2)
+            goto resume_2;
+        else if (state == 3)
+            goto resume_3;
+        else if (state == 4)
+            goto resume_4;
+        else if (state == 5)
+            goto resume_5;
+        else if (state == 6)
+            goto resume_6;
+        else if (state == 7)
+            goto resume_7;
+        else if (state == 8)
+            goto resume_8;
|
else if (state == 9)
|
||||||
|
goto resume_9;
|
||||||
|
// Get children to merge
|
||||||
|
get_merge_children();
|
||||||
|
// Try to select an inode for the "inverse" optimized scenario
|
||||||
|
// Read statistics from etcd to do it
|
||||||
|
read_stats();
|
||||||
|
state = 1;
|
||||||
|
resume_1:
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return;
|
||||||
|
choose_inverse_candidate();
|
||||||
|
// Merge children one by one, except our "inverse" child
|
||||||
|
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||||
{
|
{
|
||||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
if (merge_children[current_child] == inverse_child)
|
||||||
.pg_num = pg_num,
|
continue;
|
||||||
.rm_osd_num = primary_osd,
|
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||||
.objects = objects,
|
resume_2:
|
||||||
.obj_count = objects.size(),
|
while (!cb())
|
||||||
.obj_done = 0,
|
|
||||||
});
|
|
||||||
if (min_offset == 0)
|
|
||||||
{
|
{
|
||||||
total_count += objects.size();
|
state = 2;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
else
|
cb = NULL;
|
||||||
{
|
parent->change_parent(merge_children[current_child], new_parent);
|
||||||
for (object_id oid: objects)
|
state = 3;
|
||||||
{
|
resume_3:
|
||||||
if (oid.stripe >= min_offset)
|
if (parent->waiting > 0)
|
||||||
{
|
return;
|
||||||
total_count++;
|
}
|
||||||
}
|
// Merge our "inverse" child into our "inverse" parent
|
||||||
}
|
if (inverse_child != 0)
|
||||||
}
|
|
||||||
rm->obj_pos = rm->objects.begin();
|
|
||||||
lists.push_back(rm);
|
|
||||||
if (parent->list_first)
|
|
||||||
{
|
|
||||||
parent->cli->list_inode_next(lister, 1);
|
|
||||||
}
|
|
||||||
if (status & INODE_LIST_DONE)
|
|
||||||
{
|
|
||||||
lists_done = true;
|
|
||||||
}
|
|
||||||
pgs_to_list--;
|
|
||||||
continue_delete();
|
|
||||||
});
|
|
||||||
if (!lister)
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
start_merge_child(inverse_child, inverse_parent);
|
||||||
|
resume_4:
|
||||||
|
while (!cb())
|
||||||
|
{
|
||||||
|
state = 4;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cb = NULL;
|
||||||
|
// Delete "inverse" child data
|
||||||
|
start_delete_source(inverse_child);
|
||||||
|
resume_5:
|
||||||
|
while (!cb())
|
||||||
|
{
|
||||||
|
state = 5;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cb = NULL;
|
||||||
|
// Delete "inverse" child metadata, rename parent over it,
|
||||||
|
// and also change parent links of the previous "inverse" child
|
||||||
|
rename_inverse_parent();
|
||||||
|
state = 6;
|
||||||
|
resume_6:
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Delete parents, except the "inverse" one
|
||||||
|
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||||
|
{
|
||||||
|
if (chain_list[current_child] == inverse_parent)
|
||||||
|
continue;
|
||||||
|
start_delete_source(chain_list[current_child]);
|
||||||
|
resume_7:
|
||||||
|
while (!cb())
|
||||||
|
{
|
||||||
|
state = 7;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cb = NULL;
|
||||||
|
delete_inode_config(chain_list[current_child]);
|
||||||
|
state = 8;
|
||||||
|
resume_8:
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
state = 9;
|
||||||
|
resume_9:
|
||||||
|
// Done
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_merge_children()
|
||||||
|
{
|
||||||
|
// Get all children of from..to
|
||||||
|
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||||
|
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||||
|
// Check that to_cfg is actually a child of from_cfg
|
||||||
|
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||||
|
inode_config_t *cur = to_cfg;
|
||||||
|
chain_list.push_back(cur->num);
|
||||||
|
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||||
|
{
|
||||||
|
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||||
|
if (it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
cur = &it->second;
|
||||||
|
chain_list.push_back(cur->num);
|
||||||
|
}
|
||||||
|
if (cur->num != from_cfg->num)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
new_parent = from_cfg->parent_id;
|
||||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
// Calculate ranks
|
||||||
}
|
int i = chain_list.size()-1;
|
||||||
|
for (inode_t item: chain_list)
|
||||||
void send_ops(rm_pg_t *cur_list)
|
|
||||||
{
|
|
||||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
|
||||||
parent->cli->msgr.osd_peer_fds.end())
|
|
||||||
{
|
{
|
||||||
// Initiate connection
|
sources[item] = i--;
|
||||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||||
{
|
{
|
||||||
if (cur_list->obj_pos->stripe >= min_offset)
|
if (!ic.second.parent_id)
|
||||||
{
|
{
|
||||||
osd_op_t *op = new osd_op_t();
|
continue;
|
||||||
op->op_type = OSD_OP_OUT;
|
|
||||||
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
|
||||||
op->req = (osd_any_op_t){
|
|
||||||
.rw = {
|
|
||||||
.header = {
|
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
|
||||||
.id = parent->cli->next_op_id(),
|
|
||||||
.opcode = OSD_OP_DELETE,
|
|
||||||
},
|
|
||||||
.inode = cur_list->obj_pos->inode,
|
|
||||||
.offset = cur_list->obj_pos->stripe,
|
|
||||||
.len = 0,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
op->callback = [this, cur_list](osd_op_t *op)
|
|
||||||
{
|
|
||||||
cur_list->in_flight--;
|
|
||||||
if (op->reply.hdr.retval < 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
|
||||||
op->req.rw.inode, op->req.rw.offset,
|
|
||||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
|
||||||
}
|
|
||||||
delete op;
|
|
||||||
cur_list->obj_done++;
|
|
||||||
total_done++;
|
|
||||||
continue_delete();
|
|
||||||
};
|
|
||||||
cur_list->in_flight++;
|
|
||||||
parent->cli->msgr.outbox_push(op);
|
|
||||||
}
|
}
|
||||||
cur_list->obj_pos++;
|
auto it = sources.find(ic.second.parent_id);
|
||||||
}
|
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||||
}
|
|
||||||
|
|
||||||
void continue_delete()
|
|
||||||
{
|
|
||||||
if (parent->list_first && !lists_done)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < lists.size(); i++)
|
|
||||||
{
|
|
||||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
|
||||||
{
|
{
|
||||||
delete lists[i];
|
merge_children.push_back(ic.second.num);
|
||||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
if (ic.second.readonly || writers_stopped)
|
||||||
i--;
|
|
||||||
if (!lists_done)
|
|
||||||
{
|
{
|
||||||
parent->cli->list_inode_next(lister, 1);
|
inverse_candidates[ic.second.num] = it->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
send_ops(lists[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
|
||||||
{
|
|
||||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
|
||||||
total_prev_pct = total_done*1000/total_count;
|
|
||||||
}
|
|
||||||
if (lists_done && !lists.size())
|
|
||||||
{
|
|
||||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
|
||||||
state = 2;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool loop()
|
void read_stats()
|
||||||
{
|
{
|
||||||
if (state == 0)
|
if (inverse_candidates.size() == 0)
|
||||||
{
|
{
|
||||||
start_delete();
|
return;
|
||||||
state = 1;
|
|
||||||
}
|
}
|
||||||
else if (state == 1)
|
json11::Json::array reads;
|
||||||
|
for (auto cp: inverse_candidates)
|
||||||
{
|
{
|
||||||
continue_delete();
|
inode_t inode = cp.first;
|
||||||
|
reads.push_back(json11::Json::object {
|
||||||
|
{ "request_range", json11::Json::object {
|
||||||
|
{ "key", base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||||
|
) },
|
||||||
|
} }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
else if (state == 2)
|
for (auto cp: sources)
|
||||||
{
|
{
|
||||||
return true;
|
inode_t inode = cp.first;
|
||||||
|
reads.push_back(json11::Json::object {
|
||||||
|
{ "request_range", json11::Json::object {
|
||||||
|
{ "key", base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||||
|
) },
|
||||||
|
} }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
return false;
|
parent->waiting++;
|
||||||
|
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||||
|
{ "success", reads },
|
||||||
|
}, [this](std::string err, json11::Json data)
|
||||||
|
{
|
||||||
|
parent->waiting--;
|
||||||
|
if (err != "")
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
for (auto inode_result: data["responses"].array_items())
|
||||||
|
{
|
||||||
|
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||||
|
pool_id_t pool_id = 0;
|
||||||
|
inode_t inode = 0;
|
||||||
|
char null_byte = 0;
|
||||||
|
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||||
|
if (!inode || null_byte != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||||
|
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
inode = INODE_WITH_POOL(pool_id, inode);
|
||||||
|
auto & pool_cfg = pool_cfg_it->second;
|
||||||
|
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||||
|
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||||
|
}
|
||||||
|
inode_used[inode] = used_bytes;
|
||||||
|
}
|
||||||
|
parent->ringloop->wakeup();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void choose_inverse_candidate()
|
||||||
|
{
|
||||||
|
uint64_t max_diff = 0;
|
||||||
|
for (auto cp: inverse_candidates)
|
||||||
|
{
|
||||||
|
inode_t child = cp.first;
|
||||||
|
uint64_t child_used = inode_used[child];
|
||||||
|
int rank = cp.second;
|
||||||
|
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||||
|
{
|
||||||
|
inode_t parent = chain_list[i];
|
||||||
|
uint64_t parent_used = inode_used[parent];
|
||||||
|
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||||
|
{
|
||||||
|
max_diff = (parent_used-child_used);
|
||||||
|
inverse_parent = parent;
|
||||||
|
inverse_child = child;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void rename_inverse_parent()
|
||||||
|
{
|
||||||
|
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||||
|
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||||
|
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
inode_config_t *child_cfg = &child_it->second;
|
||||||
|
inode_config_t *target_cfg = &target_it->second;
|
||||||
|
std::string child_name = child_cfg->name;
|
||||||
|
std::string target_name = target_cfg->name;
|
||||||
|
std::string child_cfg_key = base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||||
|
);
|
||||||
|
std::string target_cfg_key = base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||||
|
);
|
||||||
|
// Fill new configuration
|
||||||
|
inode_config_t new_cfg = *child_cfg;
|
||||||
|
new_cfg.num = target_cfg->num;
|
||||||
|
new_cfg.parent_id = new_parent;
|
||||||
|
json11::Json::array cmp = json11::Json::array {
|
||||||
|
json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", child_cfg_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||||
|
},
|
||||||
|
json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", target_cfg_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||||
|
},
|
||||||
|
};
|
||||||
|
json11::Json::array txn = json11::Json::array {
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_delete_range", json11::Json::object {
|
||||||
|
{ "key", child_cfg_key },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", target_cfg_key },
|
||||||
|
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||||
|
{ "value", base64_encode(json11::Json({
|
||||||
|
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||||
|
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||||
|
}).dump()) },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
};
|
||||||
|
// Reparent children of inverse_child
|
||||||
|
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||||
|
{
|
||||||
|
if (cp.second.parent_id == child_cfg->num)
|
||||||
|
{
|
||||||
|
auto cp_cfg = cp.second;
|
||||||
|
cp_cfg.parent_id = inverse_parent;
|
||||||
|
auto cp_key = base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||||
|
);
|
||||||
|
cmp.push_back(json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", cp_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", cp.second.mod_revision+1 },
|
||||||
|
});
|
||||||
|
txn.push_back(json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", cp_key },
|
||||||
|
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||||
|
} },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parent->waiting++;
|
||||||
|
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||||
|
{ "compare", cmp },
|
||||||
|
{ "success", txn },
|
||||||
|
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||||
|
{
|
||||||
|
parent->waiting--;
|
||||||
|
if (err != "")
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (!res["succeeded"].bool_value())
|
||||||
|
{
|
||||||
|
fprintf(
|
||||||
|
stderr, "Parent (%s), child (%s), or one of its children"
|
||||||
|
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
|
||||||
|
);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||||
|
parent->ringloop->wakeup();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void delete_inode_config(inode_t cur)
|
||||||
|
{
|
||||||
|
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
|
||||||
|
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
inode_config_t *cur_cfg = &cur_cfg_it->second;
|
||||||
|
std::string cur_name = cur_cfg->name;
|
||||||
|
std::string cur_cfg_key = base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+
|
||||||
|
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||||
|
"/"+std::to_string(INODE_NO_POOL(cur))
|
||||||
|
);
|
||||||
|
parent->waiting++;
|
||||||
|
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||||
|
{ "compare", json11::Json::array {
|
||||||
|
json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", cur_cfg_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", cur_cfg->mod_revision+1 },
|
||||||
|
},
|
||||||
|
} },
|
||||||
|
{ "success", json11::Json::array {
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_delete_range", json11::Json::object {
|
||||||
|
{ "key", cur_cfg_key },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_delete_range", json11::Json::object {
|
||||||
|
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
} },
|
||||||
|
}, [this, cur_name](std::string err, json11::Json res)
|
||||||
|
{
|
||||||
|
parent->waiting--;
|
||||||
|
if (err != "")
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (!res["succeeded"].bool_value())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
printf("Layer %s deleted\n", cur_name.c_str());
|
||||||
|
parent->ringloop->wakeup();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void start_merge_child(inode_t child_inode, inode_t target_inode)
|
||||||
|
{
|
||||||
|
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
|
||||||
|
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
|
||||||
|
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
cb = parent->start_merge(json11::Json::object {
|
||||||
|
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
|
||||||
|
{ "target", target_it->second.name },
|
||||||
|
{ "delete-source", false },
|
||||||
|
{ "cas", use_cas },
|
||||||
|
{ "fsync-interval", fsync_interval },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void start_delete_source(inode_t inode)
|
||||||
|
{
|
||||||
|
auto source = parent->cli->st_cli.inode_config.find(inode);
|
||||||
|
if (source == parent->cli->st_cli.inode_config.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Inode %ld disappeared\n", inode);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
cb = parent->start_rm(json11::Json::object {
|
||||||
|
{ "inode", inode },
|
||||||
|
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||||
|
{ "fsync-interval", fsync_interval },
|
||||||
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
|
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
||||||
{
|
{
|
||||||
auto remover = new rm_inode_t();
|
json11::Json::array cmd = cfg["command"].array_items();
|
||||||
remover->parent = this;
|
auto snap_remover = new snap_remover_t();
|
||||||
remover->inode = cfg["inode"].uint64_value();
|
snap_remover->parent = this;
|
||||||
remover->pool_id = cfg["pool"].uint64_value();
|
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||||
if (remover->pool_id)
|
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||||
|
if (snap_remover->from_name == "")
|
||||||
{
|
{
|
||||||
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
fprintf(stderr, "Layer to remove argument is missing\n");
|
||||||
}
|
|
||||||
remover->pool_id = INODE_POOL(remover->inode);
|
|
||||||
if (!remover->pool_id)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "pool is missing\n");
|
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
if (snap_remover->to_name == "")
|
||||||
return [remover]()
|
|
||||||
{
|
{
|
||||||
if (remover->loop())
|
snap_remover->to_name = snap_remover->from_name;
|
||||||
|
}
|
||||||
|
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||||
|
if (!snap_remover->fsync_interval)
|
||||||
|
snap_remover->fsync_interval = 128;
|
||||||
|
if (!cfg["cas"].is_null())
|
||||||
|
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||||
|
if (!cfg["writers_stopped"].is_null())
|
||||||
|
snap_remover->writers_stopped = true;
|
||||||
|
return [snap_remover]()
|
||||||
|
{
|
||||||
|
snap_remover->loop();
|
||||||
|
if (snap_remover->is_done())
|
||||||
{
|
{
|
||||||
delete remover;
|
delete snap_remover;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
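
To make the size-difference heuristic from the header comment concrete, here is a hedged, self-contained sketch of the inverse-pair selection with fabricated inode ids and sizes (and ignoring the rank restriction the real choose_inverse_candidate() applies via chain_list):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    typedef uint64_t inode_t;

    int main()
    {
        // Hypothetical used-size statistics (bytes), as read_stats() would
        // fill them from etcd, for a chain <from>..<to> plus one r/o child.
        std::vector<inode_t> chain = { 101, 102, 103 };  // layers to remove
        std::map<inode_t, uint64_t> used = {
            { 101, 40<<20 }, { 102, 400<<20 }, { 103, 60<<20 },
            { 201, 30<<20 },                             // read-only child
        };
        inode_t inverse_parent = 0, inverse_child = 0, child = 201;
        uint64_t max_diff = 0;
        for (inode_t parent: chain)
        {
            if (used[parent] > used[child] && used[parent]-used[child] > max_diff)
            {
                max_diff = used[parent]-used[child];
                inverse_parent = parent;
                inverse_child = child;
            }
        }
        // 102 wins: merging 201 down into it copies ~30 MB instead of ~400 MB.
        printf("merge child %lu down into layer %lu (saves ~%lu MB of copying)\n",
            inverse_child, inverse_parent, max_diff>>20);
        return 0;
    }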
214	src/cli_rm_data.cpp (new file)
@@ -0,0 +1,214 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include "cli.h"
+#include "cluster_client.h"
+
+#define RM_LISTING 1
+#define RM_REMOVING 2
+#define RM_END 3
+
+struct rm_pg_t
+{
+    pg_num_t pg_num;
+    osd_num_t rm_osd_num;
+    std::set<object_id> objects;
+    std::set<object_id>::iterator obj_pos;
+    uint64_t obj_count = 0, obj_done = 0;
+    int state = 0;
+    int in_flight = 0;
+};
+
+struct rm_inode_t
+{
+    uint64_t inode = 0;
+    pool_id_t pool_id = 0;
+    uint64_t min_offset = 0;
+
+    cli_tool_t *parent = NULL;
+    inode_list_t *lister = NULL;
+    std::vector<rm_pg_t*> lists;
+    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
+    uint64_t pgs_to_list = 0;
+    bool lists_done = false;
+    int state = 0;
+
+    void start_delete()
+    {
+        lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
+            std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
+        {
+            rm_pg_t *rm = new rm_pg_t((rm_pg_t){
+                .pg_num = pg_num,
+                .rm_osd_num = primary_osd,
+                .objects = objects,
+                .obj_count = objects.size(),
+                .obj_done = 0,
+            });
+            if (min_offset == 0)
+            {
+                total_count += objects.size();
+            }
+            else
+            {
+                for (object_id oid: objects)
+                {
+                    if (oid.stripe >= min_offset)
+                    {
+                        total_count++;
+                    }
+                }
+            }
+            rm->obj_pos = rm->objects.begin();
+            lists.push_back(rm);
+            if (parent->list_first)
+            {
+                parent->cli->list_inode_next(lister, 1);
+            }
+            if (status & INODE_LIST_DONE)
+            {
+                lists_done = true;
+            }
+            pgs_to_list--;
+            continue_delete();
+        });
+        if (!lister)
+        {
+            fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
+            exit(1);
+        }
+        pgs_to_list = parent->cli->list_pg_count(lister);
+        parent->cli->list_inode_next(lister, parent->parallel_osds);
+    }
+
+    void send_ops(rm_pg_t *cur_list)
+    {
+        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
+            parent->cli->msgr.osd_peer_fds.end())
+        {
+            // Initiate connection
+            parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
+            return;
+        }
+        while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
+        {
+            if (cur_list->obj_pos->stripe >= min_offset)
+            {
+                osd_op_t *op = new osd_op_t();
+                op->op_type = OSD_OP_OUT;
+                // Already checked that it exists above, but anyway
+                op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
+                op->req = (osd_any_op_t){
+                    .rw = {
+                        .header = {
+                            .magic = SECONDARY_OSD_OP_MAGIC,
+                            .id = parent->cli->next_op_id(),
+                            .opcode = OSD_OP_DELETE,
+                        },
+                        .inode = cur_list->obj_pos->inode,
+                        .offset = cur_list->obj_pos->stripe,
+                        .len = 0,
+                    },
+                };
+                op->callback = [this, cur_list](osd_op_t *op)
+                {
+                    cur_list->in_flight--;
+                    if (op->reply.hdr.retval < 0)
+                    {
+                        fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
+                            op->req.rw.inode, op->req.rw.offset,
+                            cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
+                    }
+                    delete op;
+                    cur_list->obj_done++;
+                    total_done++;
+                    continue_delete();
+                };
+                cur_list->in_flight++;
+                parent->cli->msgr.outbox_push(op);
+            }
+            cur_list->obj_pos++;
+        }
+    }
+
+    void continue_delete()
+    {
+        if (parent->list_first && !lists_done)
+        {
+            return;
+        }
+        for (int i = 0; i < lists.size(); i++)
+        {
+            if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
+            {
+                delete lists[i];
+                lists.erase(lists.begin()+i, lists.begin()+i+1);
+                i--;
+                if (!lists_done)
+                {
+                    parent->cli->list_inode_next(lister, 1);
+                }
+            }
+            else
+            {
+                send_ops(lists[i]);
+            }
+        }
+        if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
+        {
+            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
+            total_prev_pct = total_done*1000/total_count;
+        }
+        if (lists_done && !lists.size())
+        {
+            printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
+            state = 2;
+        }
+    }
+
+    bool loop()
+    {
+        if (state == 0)
+        {
+            start_delete();
+            state = 1;
+        }
+        else if (state == 1)
+        {
+            continue_delete();
+        }
+        else if (state == 2)
+        {
+            return true;
+        }
+        return false;
+    }
+};
+
+std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
+{
+    auto remover = new rm_inode_t();
+    remover->parent = this;
+    remover->inode = cfg["inode"].uint64_value();
+    remover->pool_id = cfg["pool"].uint64_value();
+    if (remover->pool_id)
+    {
+        remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
+    }
+    remover->pool_id = INODE_POOL(remover->inode);
+    if (!remover->pool_id)
+    {
+        fprintf(stderr, "pool is missing\n");
+        exit(1);
+    }
+    remover->min_offset = cfg["min-offset"].uint64_value();
+    return [remover]()
+    {
+        if (remover->loop())
+        {
+            delete remover;
+            return true;
+        }
+        return false;
+    };
+}
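
Both start_rm() here and start_snap_rm() above return a std::function<bool(void)> that must be re-invoked until it reports completion. A tiny illustrative driver, for orientation only (the busy loop stands in for the real io_uring ring-loop integration, which is omitted):

    #include <functional>

    // Sketch only: the real CLI re-invokes the closure from its ring loop;
    // a bare polling loop like this is just to show the contract.
    void drive(std::function<bool(void)> op)
    {
        while (!op())
        {
            // process I/O events here before polling again
        }
    }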
@@ -1,568 +0,0 @@
-[deleted file: the previous snap_remover_t implementation, matching the code added to src/cli_rm.cpp above]
@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
     }
     else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
     {
-        for (auto prev = op->prev; prev; prev = prev->prev)
+        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
         {
             if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
             {
@@ -151,7 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
             }
             else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
             {
-                // Flushes are always in the beginning
+                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
                 break;
             }
         }
@@ -172,6 +172,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
         (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
     {
         next->prev_wait += inc;
+        assert(next->prev_wait >= 0);
        if (!next->prev_wait)
         {
             if (next->opcode == OSD_OP_SYNC)
@@ -191,6 +192,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
     if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
     {
         next->prev_wait += inc;
+        assert(next->prev_wait >= 0);
         if (!next->prev_wait)
         {
             if (next->opcode == OSD_OP_SYNC)
@@ -200,7 +200,8 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
     auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
     osd_op_t *op = new osd_op_t();
     op->op_type = OSD_OP_OUT;
-    op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
+    // Already checked that it exists above, but anyway
+    op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
     op->req = (osd_any_op_t){
         .sec_list = {
             .header = {
@@ -13,6 +13,7 @@
 epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
 {
     this->ringloop = ringloop;
+    this->pending = false;
 
     epoll_fd = epoll_create(1);
     if (epoll_fd < 0)
@@ -22,11 +23,19 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
 
     tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
 
+    consumer.loop = [this]()
+    {
+        if (pending)
+            handle_epoll_events();
+    };
+    ringloop->register_consumer(&consumer);
+
     handle_epoll_events();
 }
 
 epoll_manager_t::~epoll_manager_t()
 {
+    ringloop->unregister_consumer(&consumer);
     if (tfd)
     {
         delete tfd;
@@ -64,8 +73,13 @@ void epoll_manager_t::handle_epoll_events()
     io_uring_sqe *sqe = ringloop->get_sqe();
     if (!sqe)
     {
-        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
+        // Don't handle epoll events until we manage to post the next event handler
+        // otherwise we'll fall out of sync with EPOLLET
+        pending = true;
+        ringloop->wakeup();
+        return;
     }
+    pending = false;
     ring_data_t *data = ((ring_data_t*)sqe->user_data);
     my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
     data->callback = [this](ring_data_t *data)
@@ -11,6 +11,8 @@
 class epoll_manager_t
 {
     int epoll_fd;
+    bool pending;
+    ring_consumer_t consumer;
     ring_loop_t *ringloop;
     std::map<int, std::function<void(int, int)>> epoll_handlers;
 public:
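A note on the EPOLLET change above: the epoll fd is polled edge-triggered through io_uring, so each readiness edge is reported exactly once, and skipping one handling pass (which the old throw effectively did whenever the submission queue was full) would hang the event loop. The new code parks the work in `pending` and retries from a ring consumer once SQEs free up. A standalone C++ illustration of the underlying EPOLLET property (not Vitastor code):

    // Standalone illustration (not Vitastor code) of the EPOLLET property that
    // the fix above protects: an edge-triggered event is reported only once,
    // so a skipped handling pass is lost until NEW data arrives.
    #include <sys/epoll.h>
    #include <unistd.h>
    #include <cstdio>

    int main()
    {
        int p[2];
        if (pipe(p) != 0)
            return 1;
        int ep = epoll_create(1);
        epoll_event ev = {};
        ev.events = EPOLLIN | EPOLLET;
        ev.data.fd = p[0];
        epoll_ctl(ep, EPOLL_CTL_ADD, p[0], &ev);
        if (write(p[1], "x", 1) != 1)
            return 1;
        epoll_event out;
        printf("first wait: %d event(s)\n", epoll_wait(ep, &out, 1, 100));  // 1: the edge fired
        // We deliberately do NOT read the byte, i.e. we "skip" handling it:
        printf("second wait: %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 0: no new edge
        return 0;
    }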
@@ -351,9 +351,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
     }
     else
     {
-        printf("+++ %s 0x%lx 0x%llx+%llx\n",
+        printf("+++ %s 0x%lx 0x%llx+%lx\n",
             io->ddir == DDIR_READ ? "READ" : "WRITE",
-            (uint64_t)io, io->offset, io->xfer_buflen);
+            (uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
     }
 }
 
@@ -1,4 +1,3 @@
-extern "C" {
 // Kill atomics in fio headers
 #define _STDATOMIC_H
 #include "fio/arch/arch.h"
@@ -11,6 +10,7 @@ extern "C" {
 #define CONFIG_HAVE_GETTID
 #define CONFIG_SYNC_FILE_RANGE
 #define CONFIG_PWRITEV2
+extern "C" {
 #include "fio/fio.h"
 #include "fio/optgroup.h"
 }
@@ -170,14 +170,14 @@ static int sec_init(struct thread_data *td)
     bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
     bsd->block_size = 1 << o->block_order;
 
-    sockaddr addr;
+    sockaddr_storage addr;
     if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
     {
         fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
         return 1;
     }
 
-    bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
     if (bsd->connect_fd < 0)
     {
         perror("socket");
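This `sockaddr` to `sockaddr_storage` change (repeated across the messenger, HTTP client, OSD and test tools below) is an IPv6 correctness fix: `struct sockaddr` only has room for the family field plus 14 bytes of address data, so parsing a `sockaddr_in6` into it overflows the variable, while `sockaddr_storage` is guaranteed large and aligned enough for any address family; the family is then read as `ss_family` instead of `sa_family`, which is why the `socket()` calls change too. A standalone check of the sizes involved (not repo code):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <cstdio>

    int main()
    {
        // Typical Linux output: sockaddr=16 sockaddr_in=16 sockaddr_in6=28 sockaddr_storage=128
        printf("sockaddr=%zu sockaddr_in=%zu sockaddr_in6=%zu sockaddr_storage=%zu\n",
            sizeof(sockaddr), sizeof(sockaddr_in), sizeof(sockaddr_in6), sizeof(sockaddr_storage));
        return 0;
    }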
@@ -192,11 +192,16 @@ static int sec_init(struct thread_data *td)
     setsockopt(bsd->connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
     if (o->zerocopy_send)
     {
+#ifndef SO_ZEROCOPY
+        perror("zerocopy send not supported on your system (socket.h misses SO_ZEROCOPY)");
+        return 1;
+#else
         if (setsockopt(bsd->connect_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
         {
             perror("setsockopt zerocopy");
             return 1;
         }
+#endif
     }
 
     // FIXME: read config (block size) from OSD
@@ -306,7 +311,13 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
             iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
             wtotal += io->xfer_buflen;
         }
-        if (sendv_blocking(bsd->connect_fd, iov, iovcnt, opt->zerocopy_send ? MSG_ZEROCOPY : 0) != wtotal)
+        if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
+#ifdef SO_ZEROCOPY
+            opt->zerocopy_send ? MSG_ZEROCOPY : 0
+#else
+            0
+#endif
+        ) != wtotal)
         {
             perror("sendmsg");
             exit(1);
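The `#ifdef SO_ZEROCOPY` guards keep the fio engine building against pre-4.14 kernel headers, where `SO_ZEROCOPY` and `MSG_ZEROCOPY` do not exist. A minimal sketch of the kernel zero-copy send protocol these flags belong to (standalone, error handling omitted; see the kernel's msg_zerocopy documentation for the full flow):

    #include <sys/socket.h>
    #include <cstddef>

    void send_maybe_zerocopy(int fd, const void *buf, size_t len)
    {
    #ifdef SO_ZEROCOPY
        int one = 1;
        setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)); // opt in once per socket
        send(fd, buf, len, MSG_ZEROCOPY); // kernel pins the pages instead of copying
        // The buffer must stay untouched until a completion notification is
        // read from the socket's error queue (recvmsg with MSG_ERRQUEUE).
    #else
        send(fd, buf, len, 0); // headers predate 4.14: plain copying send
    #endif
    }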
@@ -344,7 +355,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
     {
         if (reply.hdr.retval != io->xfer_buflen)
         {
-            fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
+            fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
             exit(1);
         }
         // Support bitmap
@@ -369,7 +380,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
     {
         if (reply.hdr.retval != io->xfer_buflen)
         {
-            fprintf(stderr, "Short write: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
+            fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
             exit(1);
         }
     }
@@ -62,9 +62,10 @@ struct http_co_t
     void run_cb_and_clear();
     void start_connection();
     void close_connection();
+    void next_request();
     void handle_events();
     void handle_connect_result();
-    void submit_read();
+    void submit_read(bool check_timeout);
     void submit_send();
     bool handle_read();
     void post_message(int type, const std::string & msg);
@@ -128,6 +129,7 @@ void http_co_t::run_cb_and_clear()
     // Call callback after clearing it because otherwise we may hit reenterability problems
     if (cb != NULL)
         cb(&parsed);
+    next_request();
 }
 
 void http_co_t::send_request(const std::string & host, const std::string & request,
@@ -161,17 +163,6 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
     this->sent = 0;
     this->response_callback = response_callback;
     this->parsed = {};
-    if (request_timeout > 0)
-    {
-        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
-        {
-            stackin();
-            close_connection();
-            parsed = { .error = "HTTP request timed out" };
-            run_cb_and_clear();
-            stackout();
-        });
-    }
     if (state == HTTP_CO_KEEPALIVE)
     {
         state = HTTP_CO_SENDING_REQUEST;
@@ -181,6 +172,28 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
     {
         start_connection();
     }
+    // Do it _after_ state assignment because set_timer() can actually trigger
+    // other timers and requests (reenterability is our friend)
+    if (request_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
+        {
+            stackin();
+            if (state == HTTP_CO_REQUEST_SENT)
+            {
+                // In case of high CPU load, we may not handle etcd responses in time
+                // For this case, first check the socket and only then terminate request with the timeout
+                submit_read(true);
+            }
+            else
+            {
+                close_connection();
+                parsed = { .error = "HTTP request timed out" };
+                run_cb_and_clear();
+            }
+            stackout();
+        });
+    }
     stackout();
 }
 
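The comment above is the key point of this hunk: `tfd->set_timer()` may synchronously fire other expired timers, whose callbacks can re-enter this object, so the timeout is now registered only after the state transition is complete. A contrived standalone demo of why the order matters (hypothetical `set_timer_like`, not the timerfd manager API):

    #include <cstdio>
    #include <functional>

    static std::function<void()> due_timer; // a previously expired timer
    static int state = 0;

    void set_timer_like(std::function<void()> cb)
    {
        if (due_timer) // a real manager may fire already-due timers inline here
        {
            auto prev = due_timer;
            due_timer = nullptr;
            prev();
        }
        due_timer = cb;
    }

    int main()
    {
        due_timer = []() { printf("re-entrant callback sees state=%d\n", state); };
        state = 1;                // commit the transition first...
        set_timer_like([]() {});  // ...so the inline fire observes state=1
        return 0;
    }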
@@ -271,17 +284,19 @@ void http_co_t::close_connection()
 void http_co_t::start_connection()
 {
     stackin();
-    struct sockaddr addr;
+    struct sockaddr_storage addr;
     if (!string_to_addr(host.c_str(), 1, 80, &addr))
     {
+        close_connection();
         parsed = { .error = "Invalid address: "+host };
         run_cb_and_clear();
         stackout();
         return;
     }
-    peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
     if (peer_fd < 0)
     {
+        close_connection();
         parsed = { .error = std::string("socket: ")+strerror(errno) };
         run_cb_and_clear();
         stackout();
@@ -323,7 +338,7 @@ void http_co_t::handle_events()
         epoll_events &= ~EPOLLOUT;
         if (epoll_events & EPOLLIN)
         {
-            submit_read();
+            submit_read(false);
         }
         else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
         {
@@ -410,10 +425,11 @@ again:
     stackout();
 }
 
-void http_co_t::submit_read()
+void http_co_t::submit_read(bool check_timeout)
 {
     stackin();
     int res;
+again:
     if (rbuf.size() != READ_BUFFER_SIZE)
     {
         rbuf.resize(READ_BUFFER_SIZE);
@@ -428,7 +444,22 @@ void http_co_t::submit_read()
     }
     if (res == -EAGAIN || res == -EINTR)
     {
-        epoll_events = epoll_events & ~EPOLLIN;
+        if (check_timeout)
+        {
+            if (res == -EINTR)
+                goto again;
+            else
+            {
+                // Timeout happened and there is no data to read
+                close_connection();
+                parsed = { .error = "HTTP request timed out" };
+                run_cb_and_clear();
+            }
+        }
+        else
+        {
+            epoll_events = epoll_events & ~EPOLLIN;
+        }
     }
     else if (res <= 0)
     {
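In the timeout path above, the distinction between the two "no data" results matters: `-EINTR` only means the `read()` was interrupted by a signal, so it is retried via `goto again` before the request is declared dead, while `-EAGAIN` on a non-blocking socket genuinely means nothing arrived and the timeout is real. The classic retry idiom in isolation (standalone sketch, not repo code):

    #include <cerrno>
    #include <unistd.h>

    ssize_t read_retrying(int fd, void *buf, size_t len)
    {
        ssize_t res;
        do
            res = read(fd, buf, len);      // -1 with EINTR: a signal, not a timeout
        while (res < 0 && errno == EINTR); // so retry; EAGAIN falls through
        return res;                        // caller treats EAGAIN as "really no data"
    }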
@@ -501,8 +532,11 @@ bool http_co_t::handle_read()
     if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
     {
         std::swap(parsed.body, response);
-        response_callback(&parsed);
-        parsed.eof = true;
+        if (!keepalive)
+            close_connection();
+        else
+            state = HTTP_CO_KEEPALIVE;
+        run_cb_and_clear();
     }
     else if (state == HTTP_CO_CHUNKED && response.size() > 0)
     {
@@ -533,10 +567,14 @@ bool http_co_t::handle_read()
             response_callback(&parsed);
             parsed.body = "";
         }
-        if (parsed.eof && !want_streaming)
+        else if (parsed.eof)
         {
             // Normal response
-            response_callback(&parsed);
+            if (!keepalive)
+                close_connection();
+            else
+                state = HTTP_CO_KEEPALIVE;
+            run_cb_and_clear();
         }
     }
     else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
@@ -547,29 +585,20 @@ bool http_co_t::handle_read()
             parsed.body = "";
         }
     }
-    if (parsed.eof)
-    {
-        response_callback = NULL;
-        parsed = {};
-        if (!keepalive)
-        {
-            close_connection();
-        }
-        else
-        {
-            state = HTTP_CO_KEEPALIVE;
-            if (keepalive_queue.size() > 0)
-            {
-                auto next = keepalive_queue[0];
-                keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
-                next();
-            }
-        }
-    }
     stackout();
     return true;
 }
 
+void http_co_t::next_request()
+{
+    if (keepalive_queue.size() > 0)
+    {
+        auto next = keepalive_queue[0];
+        keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
+        next();
+    }
+}
+
 uint64_t stoull_full(const std::string & str, int base)
 {
     if (isspace(str[0]))
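Moving the queue pop into `next_request()` and invoking it from `run_cb_and_clear()` means the next queued keepalive request is started on every completion path, including timeouts and connection errors, instead of only on the clean-EOF path that the removed block covered; previously a failed request could leave the rest of `keepalive_queue` stalled. Note the copy-then-erase order: the callback may itself enqueue new requests, so the element is taken off the queue before it runs. The same drain-one idiom in a generic form (sketch, not repo code):

    #include <deque>
    #include <functional>

    static std::deque<std::function<void()>> keepalive_queue_like;

    void start_next()
    {
        if (!keepalive_queue_like.empty())
        {
            // Copy and pop BEFORE invoking: the callback may push new requests,
            // which would otherwise invalidate a reference into the queue.
            auto next = keepalive_queue_like.front();
            keepalive_queue_like.pop_front();
            next();
        }
    }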
@@ -222,13 +222,13 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
 void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
 {
     assert(peer_osd != this->osd_num);
-    struct sockaddr addr;
+    struct sockaddr_storage addr;
     if (!string_to_addr(peer_host, 0, peer_port, &addr))
     {
         on_connect_peer(peer_osd, -EINVAL);
         return;
     }
-    int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    int peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
     if (peer_fd < 0)
     {
         on_connect_peer(peer_osd, -errno);
@@ -484,10 +484,10 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
 void osd_messenger_t::accept_connections(int listen_fd)
 {
     // Accept new connections
-    sockaddr addr;
+    sockaddr_storage addr;
     socklen_t peer_addr_size = sizeof(addr);
     int peer_fd;
-    while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0)
+    while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
     {
         assert(peer_fd != 0);
         fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
@@ -49,7 +49,7 @@ struct osd_client_t
 {
     int refs = 0;
 
-    sockaddr peer_addr;
+    sockaddr_storage peer_addr;
     int peer_port;
     int peer_fd;
     int peer_state;
@@ -111,6 +111,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
         {
             delete cl->read_op;
         }
+        else
+        {
+            cancel_op(cl->read_op);
+        }
         cl->read_op = NULL;
     }
     if (cl->osd_num)
@@ -55,6 +55,15 @@ protected:
     iovec read_iov = { 0 };
 
 public:
+    ~nbd_proxy()
+    {
+        if (recv_buf)
+        {
+            free(recv_buf);
+            recv_buf = NULL;
+        }
+    }
+
     static json11::Json::object parse_args(int narg, const char *args[])
     {
         json11::Json::object cfg;
@@ -322,6 +331,9 @@ public:
         delete cli;
         delete epmgr;
         delete ringloop;
+        cli = NULL;
+        epmgr = NULL;
+        ringloop = NULL;
     }
 
     void load_module()
@@ -351,7 +363,8 @@ public:
         setsid();
         if (fork())
             exit(0);
-        chdir("/");
+        if (chdir("/") != 0)
+            fprintf(stderr, "Warning: Failed to chdir into /\n");
         close(0);
         close(1);
         close(2);
@@ -498,7 +511,7 @@ protected:
             goto end_unmap;
         }
         ioctl(nbd, NBD_SET_FLAGS, flags);
-        if (timeout >= 0)
+        if (timeout > 0)
        {
             r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)timeout);
             if (r < 0)
@@ -513,7 +526,11 @@ protected:
         {
             goto end_unmap;
         }
-        write(qd_fd, "32768", 5);
+        r = write(qd_fd, "32768", 5);
+        if (r != 5)
+        {
+            fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
+        }
         close(qd_fd);
         if (!fork())
         {
src/osd.cpp (56 changed lines)
@@ -3,6 +3,7 @@
 
 #include <sys/socket.h>
 #include <sys/poll.h>
+#include <sys/mman.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <arpa/inet.h>
@@ -53,6 +54,20 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
         autosync_writes = max_autosync;
     }
 
+    if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
+    {
+        // Lock all OSD memory if requested
+        if (mlockall(MCL_CURRENT|MCL_FUTURE
+#ifdef MCL_ONFAULT
+            | MCL_ONFAULT
+#endif
+            ) != 0)
+        {
+            fprintf(stderr, "osd_memlock is set to true, but mlockall() failed: %s\n", strerror(errno));
+            exit(-1);
+        }
+    }
+
     this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
     {
         print_stats();
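`mlockall(MCL_CURRENT|MCL_FUTURE)` pins all current and future process memory so the OSD datapath never stalls on a major page fault or gets swapped out; `MCL_ONFAULT`, available since Linux 4.4 and hence behind `#ifdef`, locks pages only when they are first touched instead of prefaulting the entire mapping up front. A standalone equivalent of the new `osd_memlock` logic (not the repo source):

    #include <sys/mman.h>
    #include <cerrno>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        int flags = MCL_CURRENT | MCL_FUTURE;
    #ifdef MCL_ONFAULT
        flags |= MCL_ONFAULT; // Linux 4.4+: lock pages on first touch, no prefault
    #endif
        if (mlockall(flags) != 0)
        {
            // Usually needs CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK (ulimit -l)
            fprintf(stderr, "mlockall failed: %s\n", strerror(errno));
            return 1;
        }
        return 0;
    }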
@@ -185,46 +200,7 @@ void osd_t::bind_socket()
 
     // FIXME Support multiple listening sockets
 
-    sockaddr addr;
-    if (!string_to_addr(bind_address, 0, bind_port, &addr))
-    {
-        throw std::runtime_error("bind address "+bind_address+" is not valid");
-    }
-
-    listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
-    if (listen_fd < 0)
-    {
-        throw std::runtime_error(std::string("socket: ") + strerror(errno));
-    }
-    int enable = 1;
-    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
-
-    if (bind(listen_fd, &addr, sizeof(addr)) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("bind: ") + strerror(errno));
-    }
-    if (bind_port == 0)
-    {
-        socklen_t len = sizeof(addr);
-        if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
-        {
-            close(listen_fd);
-            throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
-        }
-        listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
-    }
-    else
-    {
-        listening_port = bind_port;
-    }
-
-    if (listen(listen_fd, listen_backlog) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("listen: ") + strerror(errno));
-    }
-
+    listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
     fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
 
     epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
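The listen-socket boilerplate removed here (and from the two stub OSDs below) is consolidated into a shared `create_and_bind_socket()` helper. The diff does not show the helper's body; the following reconstruction is an assumption pieced together from the removed code and the call sites `create_and_bind_socket(address, port, backlog, &listening_port)`, not the actual implementation:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>
    #include <cerrno>
    #include <cstring>
    #include <stdexcept>
    #include <string>

    // Provided elsewhere in the repo (signature assumed from the call sites)
    bool string_to_addr(std::string str, bool parse_port, int default_port, sockaddr_storage *addr);

    int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
    {
        sockaddr_storage addr;
        if (!string_to_addr(bind_address, 0, bind_port, &addr))
            throw std::runtime_error("bind address "+bind_address+" is not valid");
        int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
        if (listen_fd < 0)
            throw std::runtime_error(std::string("socket: ") + strerror(errno));
        int enable = 1;
        setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
        if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
        {
            close(listen_fd);
            throw std::runtime_error(std::string("bind: ") + strerror(errno));
        }
        if (listening_port)
        {
            if (bind_port == 0)
            {
                // An ephemeral port was assigned - report it back to the caller
                socklen_t len = sizeof(addr);
                getsockname(listen_fd, (sockaddr*)&addr, &len);
                *listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
            }
            else
                *listening_port = bind_port;
        }
        if (listen(listen_fd, listen_backlog) < 0)
        {
            close(listen_fd);
            throw std::runtime_error(std::string("listen: ") + strerror(errno));
        }
        return listen_fd;
    }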
@@ -211,7 +211,7 @@ class osd_t
     // flushing, recovery and backfill
     void submit_pg_flush_ops(pg_t & pg);
     void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
-    void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
+    bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
     bool pick_next_recovery(osd_recovery_op_t &op);
     void submit_recovery_op(osd_recovery_op_t *op);
     bool continue_recovery();
@@ -457,7 +457,8 @@ void osd_t::renew_lease()
         if (err == "" && data["result"]["TTL"].string_value() == "")
         {
             // Die
-            throw std::runtime_error("etcd lease has expired");
+            fprintf(stderr, "Error refreshing etcd lease\n");
+            force_stop(1);
         }
         if (err != "")
         {
@@ -466,7 +467,8 @@ void osd_t::renew_lease()
             if (etcd_failed_attempts > st_cli.max_etcd_attempts)
             {
                 // Die
-                throw std::runtime_error("Cluster connection failed");
+                fprintf(stderr, "Cluster connection failed\n");
+                force_stop(1);
             }
             // Retry
             tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
@@ -47,7 +47,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
         if (l.second.size() > 0)
         {
             fb->flush_ops++;
-            submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
+            if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
+                return;
         }
     }
     for (auto & l: fb->stable_lists)
@@ -55,7 +56,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
         if (l.second.size() > 0)
         {
             fb->flush_ops++;
-            submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
+            if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
+                return;
         }
     }
 }
@@ -160,7 +162,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
     }
 }
 
-void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
+bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
 {
     osd_op_t *op = new osd_op_t();
     // Copy buffer so it gets freed along with the operation
@@ -188,10 +190,8 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
     else
     {
         // Peer
-        int peer_fd = msgr.osd_peer_fds[peer_osd];
         op->op_type = OSD_OP_OUT;
         op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
-        op->peer_fd = peer_fd;
         op->req = (osd_any_op_t){
             .sec_stab = {
                 .header = {
@@ -207,8 +207,21 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
             handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
             delete op;
         };
-        msgr.outbox_push(op);
+        auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
+        if (peer_fd_it != msgr.osd_peer_fds.end())
+        {
+            op->peer_fd = peer_fd_it->second;
+            msgr.outbox_push(op);
+        }
+        else
+        {
+            // Fail it immediately
+            op->reply.hdr.retval = -EPIPE;
+            op->callback(op);
+            return false;
+        }
     }
+    return true;
 }
 
 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
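The pattern introduced here recurs at every remote-subop call site below: the peer fd is looked up with `find()` at the moment of sending, and if the peer has disconnected in the meantime the operation is completed inline with `-EPIPE` through its normal callback, instead of crashing on `osd_peer_fds.at()` or silently inserting an empty entry via `operator[]`. The generic shape (standalone sketch, not repo code):

    #include <cerrno>
    #include <cstdint>
    #include <functional>
    #include <map>

    struct op_like_t
    {
        int peer_fd = -1;
        int64_t retval = 0;
        std::function<void(op_like_t*)> callback;
    };

    std::map<uint64_t, int> peer_fds; // stands in for msgr.osd_peer_fds

    bool push_or_fail(uint64_t osd_num, op_like_t *op, std::function<void(op_like_t*)> outbox_push)
    {
        auto it = peer_fds.find(osd_num);
        if (it != peer_fds.end())
        {
            op->peer_fd = it->second;
            outbox_push(op);      // the connection is alive - send normally
            return true;
        }
        op->retval = -EPIPE;      // peer vanished between scheduling and sending
        op->callback(op);         // complete through the usual callback path
        return false;
    }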
@@ -29,8 +29,10 @@ void osd_t::handle_peers()
             degraded_objects += p.second.degraded_objects.size();
             if (p.second.state & PG_HAS_UNCLEAN)
                 peering_state = peering_state | OSD_FLUSHING_PGS;
-            else if (p.second.state & PG_HAS_DEGRADED)
+            else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
                 peering_state = peering_state | OSD_RECOVERING;
+            ringloop->wakeup();
+            return;
         }
         else
         {
@@ -340,7 +342,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
     else
     {
         // Peer
-        auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
+        auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
         osd_op_t *op = new osd_op_t();
         op->op_type = OSD_OP_OUT;
         op->peer_fd = cl->peer_fd;
@@ -394,7 +396,9 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
     {
         if (op->bs_op->retval < 0)
         {
-            throw std::runtime_error("local OP_LIST failed");
+            printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
+            force_stop(1);
+            return;
         }
         add_bs_subop_stats(op);
         printf(
@@ -419,7 +423,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
         // Peer
         osd_op_t *op = new osd_op_t();
         op->op_type = OSD_OP_OUT;
-        op->peer_fd = msgr.osd_peer_fds[role_osd];
+        op->peer_fd = msgr.osd_peer_fds.at(role_osd);
         op->req = (osd_any_op_t){
             .sec_list = {
                 .header = {
@@ -246,7 +246,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
             // Send to a remote OSD
             osd_op_t *subop = op_data->subops+subop_idx;
             subop->op_type = OSD_OP_OUT;
-            subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
             // FIXME: Use the pre-allocated buffer
             subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
             subop->req = (osd_any_op_t){
@@ -287,7 +286,18 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                 }
                 handle_primary_subop(subop, cur_op);
             };
-            msgr.outbox_push(subop);
+            auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num);
+            if (peer_fd_it != msgr.osd_peer_fds.end())
+            {
+                subop->peer_fd = peer_fd_it->second;
+                msgr.outbox_push(subop);
+            }
+            else
+            {
+                // Fail it immediately
+                subop->reply.hdr.retval = -EPIPE;
+                subop->callback(subop);
+            }
             subop_idx++;
         }
         prev = i+1;
@@ -182,7 +182,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
     else
     {
         subop->op_type = OSD_OP_OUT;
-        subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
         subop->bitmap = stripes[stripe_num].bmp_buf;
         subop->bitmap_len = clean_entry_bitmap_size;
         subop->req.sec_rw = {
@@ -225,7 +224,18 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
         {
             handle_primary_subop(subop, cur_op);
         };
-        msgr.outbox_push(subop);
+        auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
+        if (peer_fd_it != msgr.osd_peer_fds.end())
+        {
+            subop->peer_fd = peer_fd_it->second;
+            msgr.outbox_push(subop);
+        }
+        else
+        {
+            // Fail it immediately
+            subop->reply.hdr.retval = -EPIPE;
+            subop->callback(subop);
+        }
     }
     i++;
 }
@@ -463,7 +473,6 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
     else
     {
         subops[i].op_type = OSD_OP_OUT;
-        subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
         subops[i].req = (osd_any_op_t){ .sec_del = {
             .header = {
                 .magic = SECONDARY_OSD_OP_MAGIC,
@@ -477,7 +486,18 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
         {
             handle_primary_subop(subop, cur_op);
         };
-        msgr.outbox_push(&subops[i]);
+        auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
+        if (peer_fd_it != msgr.osd_peer_fds.end())
+        {
+            subops[i].peer_fd = peer_fd_it->second;
+            msgr.outbox_push(&subops[i]);
+        }
+        else
+        {
+            // Fail it immediately
+            subops[i].reply.hdr.retval = -EPIPE;
+            subops[i].callback(&subops[i]);
+        }
     }
 }
@@ -567,7 +587,6 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
     else
     {
         subops[i].op_type = OSD_OP_OUT;
-        subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
         subops[i].req = (osd_any_op_t){ .sec_stab = {
             .header = {
                 .magic = SECONDARY_OSD_OP_MAGIC,
@@ -581,7 +600,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
         {
             handle_primary_subop(subop, cur_op);
         };
-        msgr.outbox_push(&subops[i]);
+        auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
+        if (peer_fd_it != msgr.osd_peer_fds.end())
+        {
+            subops[i].peer_fd = peer_fd_it->second;
+            msgr.outbox_push(&subops[i]);
+        }
+        else
+        {
+            // Fail it immediately
+            subops[i].reply.hdr.retval = -EPIPE;
+            subops[i].callback(&subops[i]);
+        }
     }
 }
@@ -8,7 +8,7 @@
 #include "osd_id.h"
 
 #ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT 512
+#define MEM_ALIGNMENT 4096
 #endif
 
 struct buf_len_t
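Raising `MEM_ALIGNMENT` to 4096 lines buffer alignment up with O_DIRECT requirements: direct I/O buffers must be aligned to the device's logical sector size, and 4096 covers native-4K (4Kn) drives as well as legacy 512-byte ones, at the cost of a little extra allocator padding. A minimal allocation sketch using this constant's new value (not repo code):

    #include <cstddef>
    #include <cstdlib>

    void *alloc_io_buffer(size_t len)
    {
        void *buf = nullptr;
        if (posix_memalign(&buf, 4096, len) != 0) // 4096 = the new MEM_ALIGNMENT
            return nullptr;
        return buf;
    }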
@@ -134,14 +134,14 @@ int main(int narg, char *args[])
 
 int connect_osd(const char *osd_address, int osd_port)
 {
-    struct sockaddr addr;
+    struct sockaddr_storage addr;
     if (!string_to_addr(osd_address, 0, osd_port, &addr))
     {
         fprintf(stderr, "server address: %s is not valid\n", osd_address);
         return -1;
     }
 
-    int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
     if (connect_fd < 0)
     {
         perror("socket");
@@ -112,3 +112,17 @@ void ring_loop_t::restore(unsigned sqe_tail)
     }
     ring.sq.sqe_tail = sqe_tail;
 }
+
+int ring_loop_t::sqes_left()
+{
+    struct io_uring_sq *sq = &ring.sq;
+    unsigned int head = io_uring_smp_load_acquire(sq->khead);
+    unsigned int next = sq->sqe_tail + 1;
+    int left = *sq->kring_entries - (next - head);
+    if (left > free_ring_data_ptr)
+    {
+        // return min(sqes left, ring_datas left)
+        return free_ring_data_ptr;
+    }
+    return left;
+}
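`sqes_left()` reports the remaining submission budget as the minimum of two resources: free slots in the io_uring submission ring (the distance between the kernel's consumer head and our producer tail, computed with wrap-safe unsigned arithmetic) and free `ring_data_t` completion slots, since every SQE submitted through this loop needs one to carry its callback. The clamping arithmetic in isolation (standalone, not repo code):

    #include <algorithm>
    #include <cstdio>

    int sqes_left_like(unsigned head, unsigned tail, unsigned ring_entries, int free_ring_data)
    {
        int left = (int)(ring_entries - ((tail + 1) - head)); // wrap-safe unsigned math
        return std::min(left, free_ring_data);                // each SQE also needs a ring_data_t
    }

    int main()
    {
        // 128-entry ring with 100 submissions in flight, but only 10 callback slots:
        printf("%d\n", sqes_left_like(0, 100, 128, 10)); // prints 10
        return 0;
    }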
|
@@ -17,15 +17,12 @@
|
|||||||
|
|
||||||
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
|
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
|
||||||
{
|
{
|
||||||
sqe->opcode = op;
|
// Prepare a read/write operation without clearing user_data
|
||||||
sqe->flags = 0;
|
// Very recently, 22 Dec 2021, liburing finally got this change too (8ecd3fd959634df81d66af8b3a69c16202a014e8)
|
||||||
sqe->ioprio = 0;
|
// But all versions prior to it (sadly) clear user_data
|
||||||
sqe->fd = fd;
|
__u64 user_data = sqe->user_data;
|
||||||
sqe->off = offset;
|
io_uring_prep_rw(op, sqe, fd, addr, len, offset);
|
||||||
sqe->addr = (unsigned long) addr;
|
sqe->user_data = user_data;
|
||||||
sqe->len = len;
|
|
||||||
sqe->rw_flags = 0;
|
|
||||||
sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
|
static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
|
||||||
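The rewritten `my_uring_prep_rw()` works on top of stock liburing while preserving `user_data`: the ring loop stores a `ring_data_t*` there when it hands out the SQE, and `io_uring_prep_rw()` in all liburing releases before the commit cited above zeroes the field, which would make the completion handler dereference a null pointer. Saving the value around the call makes the helper safe on any liburing version. The pattern in isolation (sketch; requires liburing headers):

    #include <liburing.h>

    static inline void prep_rw_keep_user_data(int op, struct io_uring_sqe *sqe,
        int fd, const void *addr, unsigned len, off_t offset)
    {
        __u64 user_data = sqe->user_data;                 // set when the SQE was handed out
        io_uring_prep_rw(op, sqe, fd, addr, len, offset); // older liburing zeroes user_data here
        sqe->user_data = user_data;                       // restore it for the CQE handler
    }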
@@ -172,6 +169,7 @@ public:
         struct io_uring_cqe *cqe;
         return io_uring_wait_cqe(&ring, &cqe);
     }
+    int sqes_left();
     inline unsigned space_left()
     {
         return free_ring_data_ptr;
@@ -67,14 +67,14 @@ int main(int narg, char *args[])
 
 int connect_stub(const char *server_address, int server_port)
 {
-    struct sockaddr addr;
+    struct sockaddr_storage addr;
     if (!string_to_addr(server_address, 0, server_port, &addr))
     {
         fprintf(stderr, "server address: %s is not valid\n", server_address);
         return -1;
     }
 
-    int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
     if (connect_fd < 0)
     {
         perror("socket");
@@ -41,21 +41,19 @@
 #include "rw_blocking.h"
 #include "osd_ops.h"
 
-int bind_stub(std::string bind_address, int bind_port);
-
 void run_stub(int peer_fd);
 
 int main(int narg, char *args[])
 {
-    int listen_fd = bind_stub("0.0.0.0", 11203);
+    int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
     // Accept new connections
-    sockaddr addr;
+    sockaddr_storage addr;
     socklen_t peer_addr_size = sizeof(addr);
     int peer_fd;
     while (1)
     {
         printf("stub_osd: waiting for 1 client\n");
-        peer_fd = accept(listen_fd, &addr, &peer_addr_size);
+        peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
         if (peer_fd == -1)
         {
             if (errno == EAGAIN)
@@ -76,39 +74,6 @@ int main(int narg, char *args[])
     return 0;
 }
 
-int bind_stub(std::string bind_address, int bind_port)
-{
-    int listen_backlog = 128;
-
-    sockaddr addr;
-    if (!string_to_addr(bind_address, 0, bind_port, &addr))
-    {
-        throw std::runtime_error("bind address "+bind_address+" is not valid");
-    }
-
-    int listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
-    if (listen_fd < 0)
-    {
-        throw std::runtime_error(std::string("socket: ") + strerror(errno));
-    }
-    int enable = 1;
-    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
-
-    if (bind(listen_fd, &addr, sizeof(addr)) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("bind: ") + strerror(errno));
-    }
-
-    if (listen(listen_fd, listen_backlog) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("listen: ") + strerror(errno));
-    }
-
-    return listen_fd;
-}
-
 void run_stub(int peer_fd)
 {
     osd_any_op_t op;
@@ -25,8 +25,6 @@
 #include "epoll_manager.h"
 #include "messenger.h"
 
-int bind_stub(std::string bind_address, int bind_port);
-
 void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
 
 int main(int narg, char *args[])
@@ -43,7 +41,8 @@ int main(int narg, char *args[])
     json11::Json config = json11::Json::object { { "log_level", 1 } };
     msgr->parse_config(config);
     // Accept new connections
-    int listen_fd = bind_stub("0.0.0.0", 11203);
+    int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
+    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
     epmgr->set_fd_handler(listen_fd, false, [listen_fd, msgr](int fd, int events)
     {
         msgr->accept_connections(listen_fd);
@@ -67,41 +66,6 @@ int main(int narg, char *args[])
     return 0;
 }
 
-int bind_stub(std::string bind_address, int bind_port)
-{
-    int listen_backlog = 128;
-
-    sockaddr addr;
-    if (!string_to_addr(bind_address, 0, bind_port, &addr))
-    {
-        throw std::runtime_error("bind address "+bind_address+" is not valid");
-    }
-
-    int listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
-    if (listen_fd < 0)
-    {
-        throw std::runtime_error(std::string("socket: ") + strerror(errno));
-    }
-    int enable = 1;
-    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
-
-    if (bind(listen_fd, &addr, sizeof(addr)) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("bind: ") + strerror(errno));
-    }
-
-    if (listen(listen_fd, listen_backlog) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("listen: ") + strerror(errno));
-    }
-
-    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
-
-    return listen_fd;
-}
-
 void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
 {
     op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
 
 Name: Vitastor
 Description: Vitastor client library
-Version: 0.6.12
+Version: 0.6.15
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}