Compare commits
2 Commits
node-mutex
...
msgr-iothr
Author | SHA1 | Date | |
---|---|---|---|
249a233b37 | |||
d07e072212 |
@@ -16,7 +16,6 @@ env:
|
||||
BUILDENV_IMAGE: git.yourcmc.ru/vitalif/vitastor/buildenv
|
||||
TEST_IMAGE: git.yourcmc.ru/vitalif/vitastor/test
|
||||
OSD_ARGS: '--etcd_quick_timeout 2000'
|
||||
USE_RAMDISK: 1
|
||||
|
||||
concurrency:
|
||||
group: ci-${{ github.ref }}
|
||||
@@ -198,24 +197,6 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_etcd_fail_antietcd:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: ANTIETCD=1 /root/vitastor/tests/test_etcd_fail.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_interrupted_rebalance:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
@@ -684,24 +665,6 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_heal_antietcd:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: ANTIETCD=1 /root/vitastor/tests/test_heal.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_heal_csum_32k_dmj:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
@@ -34,10 +34,6 @@ for my $line (<>)
|
||||
{
|
||||
$test_name .= '_imm';
|
||||
}
|
||||
elsif ($1 eq 'ANTIETCD')
|
||||
{
|
||||
$test_name .= '_antietcd';
|
||||
}
|
||||
else
|
||||
{
|
||||
$test_name .= '_'.lc($1).'_'.$2;
|
||||
|
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VITASTOR_VERSION "1.7.1")
|
||||
set(VERSION "1.6.1")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -1,9 +1,9 @@
|
||||
VITASTOR_VERSION ?= v1.7.1
|
||||
VERSION ?= v1.6.1
|
||||
|
||||
all: build push
|
||||
|
||||
build:
|
||||
@docker build --rm -t vitalif/vitastor-csi:$(VITASTOR_VERSION) .
|
||||
@docker build --rm -t vitalif/vitastor-csi:$(VERSION) .
|
||||
|
||||
push:
|
||||
@docker push vitalif/vitastor-csi:$(VITASTOR_VERSION)
|
||||
@docker push vitalif/vitastor-csi:$(VERSION)
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v1.7.1
|
||||
image: vitalif/vitastor-csi:v1.6.1
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -121,7 +121,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v1.7.1
|
||||
image: vitalif/vitastor-csi:v1.6.1
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "1.7.1"
|
||||
vitastorCSIDriverVersion = "1.6.1"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (1.7.1-1) unstable; urgency=medium
|
||||
vitastor (1.6.1-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
2
debian/vitastor-mon.install
vendored
2
debian/vitastor-mon.install
vendored
@@ -1,3 +1,3 @@
|
||||
mon usr/lib/vitastor/
|
||||
mon usr/lib/vitastor/mon
|
||||
mon/scripts/make-etcd usr/lib/vitastor/mon
|
||||
mon/scripts/vitastor-mon.service /lib/systemd/system
|
||||
|
2
debian/vitastor-mon.postinst
vendored
2
debian/vitastor-mon.postinst
vendored
@@ -6,6 +6,4 @@ if [ "$1" = "configure" ]; then
|
||||
addgroup --system --quiet vitastor
|
||||
adduser --system --quiet --ingroup vitastor --no-create-home --home /nonexistent vitastor
|
||||
mkdir -p /etc/vitastor
|
||||
mkdir -p /var/lib/vitastor
|
||||
chown vitastor:vitastor /var/lib/vitastor
|
||||
fi
|
||||
|
33
debian/vitastor.Dockerfile
vendored
33
debian/vitastor.Dockerfile
vendored
@@ -9,12 +9,12 @@ ARG REL=
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN set -e -x; \
|
||||
if [ "$REL" = "buster" ]; then \
|
||||
apt-get update; \
|
||||
apt-get -y install wget; \
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
|
||||
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
|
||||
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
|
||||
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
|
||||
echo >> /etc/apt/preferences; \
|
||||
echo 'Package: *' >> /etc/apt/preferences; \
|
||||
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
|
||||
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
|
||||
fi; \
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
|
||||
@@ -22,9 +22,10 @@ RUN set -e -x; \
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl
|
||||
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y build-dep fio
|
||||
RUN apt-get --download-only source fio
|
||||
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev
|
||||
|
||||
ADD . /root/vitastor
|
||||
RUN set -e -x; \
|
||||
@@ -36,10 +37,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
VER=${FULLVER%%-*}; \
|
||||
cp -r /root/vitastor vitastor-$VER; \
|
||||
cd vitastor-$VER; \
|
||||
cp -r /root/vitastor vitastor-1.6.1; \
|
||||
cd vitastor-1.6.1; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -51,14 +50,10 @@ RUN set -e -x; \
|
||||
echo fio-headers.patch >> debian/patches/series; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL/vitastor-$VER; \
|
||||
mkdir mon/node_modules; \
|
||||
cd mon/node_modules; \
|
||||
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
|
||||
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
|
||||
cd vitastor-$VER; \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
|
||||
cd vitastor-1.6.1; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
rm -rf /root/packages/vitastor-$REL/vitastor-*/
|
||||
|
@@ -9,7 +9,6 @@
|
||||
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
|
||||
affect their interaction with the cluster.
|
||||
|
||||
- [client_iothread_count](#client_iothread_count)
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_retry_enospc](#client_retry_enospc)
|
||||
@@ -24,23 +23,6 @@ affect their interaction with the cluster.
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
|
||||
## client_iothread_count
|
||||
|
||||
- Type: integer
|
||||
- Default: 0
|
||||
|
||||
Number of separate threads for handling TCP network I/O at client library
|
||||
side. Enabling 4 threads usually allows increasing peak performance of each
|
||||
client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
|
||||
100-150 to 400 thousand iops, but at the same time it increases latency.
|
||||
Latency increase depends on CPU: with CPU power saving disabled latency
|
||||
only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
|
||||
with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
|
||||
iops decrease from 2000 to 1000). RDMA isn't affected by this option.
|
||||
|
||||
It's recommended to enable client I/O threads if you don't use RDMA and want
|
||||
to increase peak client performance.
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
- Type: milliseconds
|
||||
|
@@ -9,7 +9,6 @@
|
||||
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
|
||||
затрагивают логику их работы с кластером.
|
||||
|
||||
- [client_iothread_count](#client_iothread_count)
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_retry_enospc](#client_retry_enospc)
|
||||
@@ -24,24 +23,6 @@
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
|
||||
## client_iothread_count
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 0
|
||||
|
||||
Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
|
||||
клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
|
||||
производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
|
||||
чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
|
||||
секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
|
||||
при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
|
||||
падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
|
||||
и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
|
||||
RDMA данная опция не влияет.
|
||||
|
||||
Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
|
||||
RDMA и хотите повысить пиковую производительность клиентов.
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
- Тип: миллисекунды
|
||||
|
@@ -56,24 +56,14 @@ Can't be smaller than the OSD data device sector.
|
||||
## immediate_commit
|
||||
|
||||
- Type: string
|
||||
- Default: all
|
||||
- Default: false
|
||||
|
||||
One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
|
||||
|
||||
This parameter is also really important for performance.
|
||||
|
||||
TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
|
||||
power loss protection (nonvolatile write-through cache) and also for most HDDs.
|
||||
"none" or "small" should be only selected if you use desktop SSDs without
|
||||
capacitors or drives with slow write-back cache that can't be disabled. Check
|
||||
immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
|
||||
|
||||
Detailed explanation:
|
||||
Another parameter which is really important for performance.
|
||||
|
||||
Desktop SSDs are very fast (100000+ iops) for simple random writes
|
||||
without cache flush. However, they are really slow (only around 1000 iops)
|
||||
if you try to fsync() each write, that is, if you want to guarantee that
|
||||
each change gets actually persisted to the physical media.
|
||||
if you try to fsync() each write, that is, when you want to guarantee that
|
||||
each change gets immediately persisted to the physical media.
|
||||
|
||||
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
|
||||
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
|
||||
@@ -85,8 +75,8 @@ really slow when used with desktop SSDs. Vitastor, however, can also
|
||||
efficiently utilize desktop SSDs by postponing fsync until the client calls
|
||||
it explicitly.
|
||||
|
||||
This is what this parameter regulates. When it's set to "all" Vitastor
|
||||
cluster commits each change to disks immediately and clients just
|
||||
This is what this parameter regulates. When it's set to "all" the whole
|
||||
Vitastor cluster commits each change to disks immediately and clients just
|
||||
ignore fsyncs because they know for sure that they're unneeded. This reduces
|
||||
the amount of network roundtrips performed by clients and improves
|
||||
performance. So it's always better to use server grade SSDs with
|
||||
@@ -109,5 +99,9 @@ Setting this parameter to "all" or "small" in OSD parameters requires enabling
|
||||
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
|
||||
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
|
||||
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
|
||||
vitastor-disk tries to do that by default, first checking/disabling drive cache.
|
||||
If it can't disable drive cache, the OSD gets initialized with "none".
|
||||
|
||||
TLDR: For optimal performance, set immediate_commit to "all" if you only use
|
||||
SSDs with supercapacitor-based power loss protection (nonvolatile
|
||||
write-through cache) for both data and journals in the whole Vitastor
|
||||
cluster. Set it to "small" if you only use such SSDs for journals. Leave
|
||||
empty if your drives have write-back cache.
|
||||
|
@@ -57,18 +57,9 @@ amplification) и эффективность распределения нагр
|
||||
## immediate_commit
|
||||
|
||||
- Тип: строка
|
||||
- Значение по умолчанию: all
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Одно из значений "none", "small" или "all". Глобальное значение, может быть
|
||||
переопределено [на уровне пула](pool.ru.md#immediate_commit).
|
||||
|
||||
Данный параметр тоже важен для производительности.
|
||||
|
||||
Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
|
||||
суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
|
||||
устанавливать только при использовании SSD настольного класса без
|
||||
суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
|
||||
Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).
|
||||
Ещё один важный для производительности параметр.
|
||||
|
||||
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
|
||||
секунду) при простой случайной записи без сбросов кэша. Однако они очень
|
||||
@@ -89,7 +80,7 @@ Power Loss Protection" - одинаково быстрые и со сбросо
|
||||
эффективно утилизировать настольные SSD.
|
||||
|
||||
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
|
||||
кластер Vitastor мгновенно фиксирует каждое изменение на физические
|
||||
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
|
||||
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
|
||||
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
|
||||
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
|
||||
@@ -115,3 +106,10 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
|
||||
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
|
||||
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
|
||||
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
|
||||
|
||||
Итого, вкратце: для оптимальной производительности установите
|
||||
immediate_commit в значение "all", если вы используете в кластере только SSD
|
||||
с суперконденсаторами и для данных, и для журналов. Если вы используете
|
||||
такие SSD для всех журналов, но не для данных - можете установить параметр
|
||||
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
|
||||
оставьте параметр пустым.
|
||||
|
@@ -8,14 +8,6 @@
|
||||
|
||||
These parameters only apply to Monitors.
|
||||
|
||||
- [use_antietcd](#use_antietcd)
|
||||
- [enable_prometheus](#enable_prometheus)
|
||||
- [mon_http_port](#mon_http_port)
|
||||
- [mon_http_ip](#mon_http_ip)
|
||||
- [mon_https_cert](#mon_https_cert)
|
||||
- [mon_https_key](#mon_https_key)
|
||||
- [mon_https_client_auth](#mon_https_client_auth)
|
||||
- [mon_https_ca](#mon_https_ca)
|
||||
- [etcd_mon_ttl](#etcd_mon_ttl)
|
||||
- [etcd_mon_timeout](#etcd_mon_timeout)
|
||||
- [etcd_mon_retries](#etcd_mon_retries)
|
||||
@@ -25,87 +17,6 @@ These parameters only apply to Monitors.
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## use_antietcd
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Enable experimental built-in etcd replacement (clustered key-value database):
|
||||
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
|
||||
|
||||
When set to true, monitor runs internal antietcd automatically if it finds
|
||||
a network interface with an IP address matching one of addresses in the
|
||||
`etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
|
||||
the monitor command line). If there are multiple matching addresses, it also
|
||||
checks `antietcd_port` and antietcd is started for the address with the matching port.
|
||||
By default, antietcd accepts connections on the selected IP address, but it
|
||||
can also be overridden manually in the `antietcd_ip` option.
|
||||
|
||||
When antietcd is started, monitor stores cluster metadata itself and exposes
|
||||
an etcd-compatible REST API. On disk, these metadata are stored in
|
||||
`/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
|
||||
or antietcd_data_dir options). All other antietcd parameters
|
||||
(see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
|
||||
cluster, cluster_key, persist_filter, stale_read can also be set in
|
||||
Vitastor configuration with `antietcd_` prefix.
|
||||
|
||||
You can dump/load data to or from antietcd using Antietcd `anticli` tool:
|
||||
|
||||
```
|
||||
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
|
||||
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
|
||||
```
|
||||
|
||||
## enable_prometheus
|
||||
|
||||
- Type: boolean
|
||||
- Default: true
|
||||
|
||||
Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
|
||||
|
||||
Note that only the active (master) monitor exposes metrics, others return
|
||||
HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
|
||||
|
||||
Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
|
||||
|
||||
## mon_http_port
|
||||
|
||||
- Type: integer
|
||||
- Default: 8060
|
||||
|
||||
HTTP port for monitors to listen on (including metrics exporter)
|
||||
|
||||
## mon_http_ip
|
||||
|
||||
- Type: string
|
||||
|
||||
IP address for monitors to listen on (all addresses by default)
|
||||
|
||||
## mon_https_cert
|
||||
|
||||
- Type: string
|
||||
|
||||
Path to PEM SSL certificate file for monitor to listen using HTTPS
|
||||
|
||||
## mon_https_key
|
||||
|
||||
- Type: string
|
||||
|
||||
Path to PEM SSL private key file for monitor to listen using HTTPS
|
||||
|
||||
## mon_https_client_auth
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Enable HTTPS client certificate-based authorization for monitor connections
|
||||
|
||||
## mon_https_ca
|
||||
|
||||
- Type: string
|
||||
|
||||
Path to CA certificate for client HTTPS authorization
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
- Type: seconds
|
||||
|
@@ -8,14 +8,6 @@
|
||||
|
||||
Данные параметры используются только мониторами Vitastor.
|
||||
|
||||
- [use_antietcd](#use_antietcd)
|
||||
- [enable_prometheus](#enable_prometheus)
|
||||
- [mon_http_port](#mon_http_port)
|
||||
- [mon_http_ip](#mon_http_ip)
|
||||
- [mon_https_cert](#mon_https_cert)
|
||||
- [mon_https_key](#mon_https_key)
|
||||
- [mon_https_client_auth](#mon_https_client_auth)
|
||||
- [mon_https_ca](#mon_https_ca)
|
||||
- [etcd_mon_ttl](#etcd_mon_ttl)
|
||||
- [etcd_mon_timeout](#etcd_mon_timeout)
|
||||
- [etcd_mon_retries](#etcd_mon_retries)
|
||||
@@ -25,89 +17,6 @@
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## use_antietcd
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
|
||||
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
|
||||
|
||||
Если параметр установлен в true, монитор запускает antietcd автоматически,
|
||||
если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
|
||||
конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
|
||||
командной строки монитора). Если таких адресов несколько, также проверяется
|
||||
опция `antietcd_port` и antietcd запускается для адреса с соответствующим
|
||||
портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
|
||||
IP, но его также можно определить вручную опцией `antietcd_ip`.
|
||||
|
||||
При запуске antietcd монитор сам хранит центральные метаданные кластера и
|
||||
выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
|
||||
`/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
|
||||
antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
|
||||
(смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
|
||||
node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
|
||||
в конфигурации Vitastor с префиксом `antietcd_`.
|
||||
|
||||
Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
|
||||
`anticli`:
|
||||
|
||||
```
|
||||
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
|
||||
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
|
||||
```
|
||||
|
||||
## enable_prometheus
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: true
|
||||
|
||||
Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
|
||||
|
||||
Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
|
||||
возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
|
||||
в задание по сбору метрик Prometheus.
|
||||
|
||||
Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
|
||||
|
||||
## mon_http_port
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 8060
|
||||
|
||||
Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
|
||||
|
||||
## mon_http_ip
|
||||
|
||||
- Тип: строка
|
||||
|
||||
IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
|
||||
|
||||
## mon_https_cert
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
|
||||
|
||||
## mon_https_key
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
|
||||
|
||||
## mon_https_client_auth
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
|
||||
|
||||
## mon_https_ca
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
- Тип: секунды
|
||||
|
@@ -10,7 +10,6 @@ These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||
initialization and can be changed - either with an OSD restart or, for some of
|
||||
them, even without restarting by updating configuration in etcd.
|
||||
|
||||
- [osd_iothread_count](#osd_iothread_count)
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [etcd_stats_interval](#etcd_stats_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -62,18 +61,6 @@ them, even without restarting by updating configuration in etcd.
|
||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||
|
||||
## osd_iothread_count
|
||||
|
||||
- Type: integer
|
||||
- Default: 0
|
||||
|
||||
TCP network I/O thread count for OSD. When non-zero, a single OSD process
|
||||
may handle more TCP I/O, but at a cost of increased latency because thread
|
||||
switching overhead occurs. RDMA isn't affected by this option.
|
||||
|
||||
Because of latency, instead of enabling OSD I/O threads it's recommended to
|
||||
just create multiple OSDs per disk, or use RDMA.
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
- Type: seconds
|
||||
|
@@ -11,7 +11,6 @@
|
||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
||||
изменения конфигурации в etcd.
|
||||
|
||||
- [osd_iothread_count](#osd_iothread_count)
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [etcd_stats_interval](#etcd_stats_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -63,19 +62,6 @@
|
||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||
|
||||
## osd_iothread_count
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 0
|
||||
|
||||
Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
|
||||
стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
|
||||
по сети больше данных, но ухудшает задержку из-за накладных расходов
|
||||
переключения потоков. На работу RDMA опция не влияет.
|
||||
|
||||
Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
|
||||
просто создавать по несколько OSD на каждом диске, или использовать RDMA.
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
- Тип: секунды
|
||||
|
@@ -1,32 +1,3 @@
|
||||
- name: client_iothread_count
|
||||
type: int
|
||||
default: 0
|
||||
online: false
|
||||
info: |
|
||||
Number of separate threads for handling TCP network I/O at client library
|
||||
side. Enabling 4 threads usually allows increasing peak performance of each
|
||||
client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
|
||||
100-150 to 400 thousand iops, but at the same time it increases latency.
|
||||
Latency increase depends on CPU: with CPU power saving disabled latency
|
||||
only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
|
||||
with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
|
||||
iops decrease from 2000 to 1000). RDMA isn't affected by this option.
|
||||
|
||||
It's recommended to enable client I/O threads if you don't use RDMA and want
|
||||
to increase peak client performance.
|
||||
info_ru: |
|
||||
Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
|
||||
клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
|
||||
производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
|
||||
чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
|
||||
секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
|
||||
при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
|
||||
падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
|
||||
и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
|
||||
RDMA данная опция не влияет.
|
||||
|
||||
Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
|
||||
RDMA и хотите повысить пиковую производительность клиентов.
|
||||
- name: client_retry_interval
|
||||
type: ms
|
||||
min: 10
|
||||
|
@@ -47,24 +47,14 @@
|
||||
Не может быть меньше размера сектора дисков данных OSD.
|
||||
- name: immediate_commit
|
||||
type: string
|
||||
default: all
|
||||
default: false
|
||||
info: |
|
||||
One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
|
||||
|
||||
This parameter is also really important for performance.
|
||||
|
||||
TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
|
||||
power loss protection (nonvolatile write-through cache) and also for most HDDs.
|
||||
"none" or "small" should be only selected if you use desktop SSDs without
|
||||
capacitors or drives with slow write-back cache that can't be disabled. Check
|
||||
immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
|
||||
|
||||
Detailed explanation:
|
||||
Another parameter which is really important for performance.
|
||||
|
||||
Desktop SSDs are very fast (100000+ iops) for simple random writes
|
||||
without cache flush. However, they are really slow (only around 1000 iops)
|
||||
if you try to fsync() each write, that is, if you want to guarantee that
|
||||
each change gets actually persisted to the physical media.
|
||||
if you try to fsync() each write, that is, when you want to guarantee that
|
||||
each change gets immediately persisted to the physical media.
|
||||
|
||||
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
|
||||
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
|
||||
@@ -76,8 +66,8 @@
|
||||
efficiently utilize desktop SSDs by postponing fsync until the client calls
|
||||
it explicitly.
|
||||
|
||||
This is what this parameter regulates. When it's set to "all" Vitastor
|
||||
cluster commits each change to disks immediately and clients just
|
||||
This is what this parameter regulates. When it's set to "all" the whole
|
||||
Vitastor cluster commits each change to disks immediately and clients just
|
||||
ignore fsyncs because they know for sure that they're unneeded. This reduces
|
||||
the amount of network roundtrips performed by clients and improves
|
||||
performance. So it's always better to use server grade SSDs with
|
||||
@@ -100,19 +90,14 @@
|
||||
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
|
||||
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
|
||||
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
|
||||
vitastor-disk tries to do that by default, first checking/disabling drive cache.
|
||||
If it can't disable drive cache, the OSD gets initialized with "none".
|
||||
|
||||
TLDR: For optimal performance, set immediate_commit to "all" if you only use
|
||||
SSDs with supercapacitor-based power loss protection (nonvolatile
|
||||
write-through cache) for both data and journals in the whole Vitastor
|
||||
cluster. Set it to "small" if you only use such SSDs for journals. Leave
|
||||
empty if your drives have write-back cache.
|
||||
info_ru: |
|
||||
Одно из значений "none", "small" или "all". Глобальное значение, может быть
|
||||
переопределено [на уровне пула](pool.ru.md#immediate_commit).
|
||||
|
||||
Данный параметр тоже важен для производительности.
|
||||
|
||||
Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
|
||||
суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
|
||||
устанавливать только при использовании SSD настольного класса без
|
||||
суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
|
||||
Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).
|
||||
Ещё один важный для производительности параметр.
|
||||
|
||||
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
|
||||
секунду) при простой случайной записи без сбросов кэша. Однако они очень
|
||||
@@ -133,7 +118,7 @@
|
||||
эффективно утилизировать настольные SSD.
|
||||
|
||||
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
|
||||
кластер Vitastor мгновенно фиксирует каждое изменение на физические
|
||||
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
|
||||
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
|
||||
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
|
||||
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
|
||||
@@ -159,3 +144,10 @@
|
||||
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
|
||||
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
|
||||
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
|
||||
|
||||
Итого, вкратце: для оптимальной производительности установите
|
||||
immediate_commit в значение "all", если вы используете в кластере только SSD
|
||||
с суперконденсаторами и для данных, и для журналов. Если вы используете
|
||||
такие SSD для всех журналов, но не для данных - можете установить параметр
|
||||
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
|
||||
оставьте параметр пустым.
|
||||
|
@@ -1,103 +1,3 @@
|
||||
- name: use_antietcd
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Enable experimental built-in etcd replacement (clustered key-value database):
|
||||
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
|
||||
|
||||
When set to true, monitor runs internal antietcd automatically if it finds
|
||||
a network interface with an IP address matching one of addresses in the
|
||||
`etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
|
||||
the monitor command line). If there are multiple matching addresses, it also
|
||||
checks `antietcd_port` and antietcd is started for address with matching port.
|
||||
By default, antietcd accepts connections on the selected IP address, but it
|
||||
can also be overridden manually in the `antietcd_ip` option.
|
||||
|
||||
When antietcd is started, monitor stores cluster metadata itself and exposes
|
||||
an etcd-compatible REST API. On disk, this metadata is stored in
|
||||
`/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
|
||||
or antietcd_data_dir options). All other antietcd parameters
|
||||
(see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
|
||||
cluster, cluster_key, persist_filter, stale_read can also be set in
|
||||
Vitastor configuration with `antietcd_` prefix.
|
||||
|
||||
You can dump/load data to or from antietcd using Antietcd `anticli` tool:
|
||||
|
||||
```
|
||||
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
|
||||
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
|
||||
```
|
||||
info_ru: |
|
||||
Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
|
||||
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
|
||||
|
||||
Если параметр установлен в true, монитор запускает antietcd автоматически,
|
||||
если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
|
||||
конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
|
||||
командной строки монитора). Если таких адресов несколько, также проверяется
|
||||
опция `antietcd_port` и antietcd запускается для адреса с соответствующим
|
||||
портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
|
||||
IP, но его также можно определить вручную опцией `antietcd_ip`.
|
||||
|
||||
При запуске antietcd монитор сам хранит центральные метаданные кластера и
|
||||
выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
|
||||
`/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
|
||||
antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
|
||||
(смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
|
||||
node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
|
||||
в конфигурации Vitastor с префиксом `antietcd_`.
|
||||
|
||||
Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
|
||||
`anticli`:
|
||||
|
||||
```
|
||||
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
|
||||
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
|
||||
```
|
||||
- name: enable_prometheus
|
||||
type: bool
|
||||
default: true
|
||||
info: |
|
||||
Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
|
||||
|
||||
Note that only the active (master) monitor exposes metrics, others return
|
||||
HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
|
||||
|
||||
Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
|
||||
info_ru: |
|
||||
Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
|
||||
|
||||
Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
|
||||
возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
|
||||
в задание по сбору метрик Prometheus.
|
||||
|
||||
Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
|
||||
- name: mon_http_port
|
||||
type: int
|
||||
default: 8060
|
||||
info: HTTP port for monitors to listen on (including metrics exporter)
|
||||
info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
|
||||
- name: mon_http_ip
|
||||
type: string
|
||||
info: IP address for monitors to listen on (all addresses by default)
|
||||
info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
|
||||
- name: mon_https_cert
|
||||
type: string
|
||||
info: Path to PEM SSL certificate file for monitor to listen using HTTPS
|
||||
info_ru: Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
|
||||
- name: mon_https_key
|
||||
type: string
|
||||
info: Path to PEM SSL private key file for monitor to listen using HTTPS
|
||||
info_ru: Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
|
||||
- name: mon_https_client_auth
|
||||
type: bool
|
||||
default: false
|
||||
info: Enable HTTPS client certificate-based authorization for monitor connections
|
||||
info_ru: Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
|
||||
- name: mon_https_ca
|
||||
type: string
|
||||
info: Path to CA certificate for client HTTPS authorization
|
||||
info_ru: Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
|
||||
- name: etcd_mon_ttl
|
||||
type: sec
|
||||
min: 5
|
||||
|
@@ -1,21 +1,3 @@
|
||||
- name: osd_iothread_count
|
||||
type: int
|
||||
default: 0
|
||||
info: |
|
||||
TCP network I/O thread count for OSD. When non-zero, a single OSD process
|
||||
may handle more TCP I/O, but at a cost of increased latency because thread
|
||||
switching overhead occurs. RDMA isn't affected by this option.
|
||||
|
||||
Because of latency, instead of enabling OSD I/O threads it's recommended to
|
||||
just create multiple OSDs per disk, or use RDMA.
|
||||
info_ru: |
|
||||
Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
|
||||
стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
|
||||
по сети больше данных, но ухудшает задержку из-за накладных расходов
|
||||
переключения потоков. На работу RDMA опция не влияет.
|
||||
|
||||
Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
|
||||
просто создавать по несколько OSD на каждом диске, или использовать RDMA.
|
||||
- name: etcd_report_interval
|
||||
type: sec
|
||||
default: 5
|
||||
|
@@ -16,6 +16,8 @@
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
|
||||
stable version from 0.9.x branch instead of 1.x
|
||||
- For Debian 10 (Buster) also enable backports repository:
|
||||
`deb http://deb.debian.org/debian buster-backports main`
|
||||
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
|
||||
|
||||
## CentOS
|
||||
|
@@ -16,6 +16,8 @@
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
|
||||
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
|
||||
- Для Debian 10 (Buster) также включите репозиторий backports:
|
||||
`deb http://deb.debian.org/debian buster-backports main`
|
||||
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
|
||||
|
||||
## CentOS
|
||||
|
@@ -17,10 +17,10 @@ To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported
|
||||
- Restart pvedaemon: `systemctl restart pvedaemon`
|
||||
|
||||
`/etc/pve/storage.cfg` example (the only required option is vitastor_pool, all others
|
||||
are listed below with their default values; `vitastor_ssd` is Proxmox storage pool id):
|
||||
are listed below with their default values):
|
||||
|
||||
```
|
||||
vitastor: vitastor_ssd
|
||||
vitastor: vitastor
|
||||
# pool to put new images into
|
||||
vitastor_pool testpool
|
||||
# path to the configuration file
|
||||
|
@@ -16,10 +16,10 @@
|
||||
- Перезапустите демон Proxmox: `systemctl restart pvedaemon`
|
||||
|
||||
Пример `/etc/pve/storage.cfg` (единственная обязательная опция - vitastor_pool, все остальные
|
||||
перечислены внизу для понимания значений по умолчанию; `vitastor_ssd` - имя хранилища в Proxmox):
|
||||
перечислены внизу для понимания значений по умолчанию):
|
||||
|
||||
```
|
||||
vitastor: vitastor_ssd
|
||||
vitastor: vitastor
|
||||
# Пул, в который будут помещаться образы дисков
|
||||
vitastor_pool testpool
|
||||
# Путь к файлу конфигурации
|
||||
|
@@ -34,8 +34,6 @@
|
||||
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
|
||||
- [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
|
||||
- [Clustered file system](../usage/nfs.en.md#vitastorfs)
|
||||
- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
|
||||
- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)
|
||||
|
||||
## Plugins and tools
|
||||
|
||||
|
@@ -36,8 +36,6 @@
|
||||
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
|
||||
- [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
|
||||
- [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)
|
||||
- [Экспериментальная встроенная замена etcd - antietcd](../config/monitor.ru.md#use_antietcd)
|
||||
- [Встроенный Prometheus-экспортер метрик](../config/monitor.ru.md#enable_prometheus)
|
||||
|
||||
## Драйверы и инструменты
|
||||
|
||||
|
@@ -68,6 +68,10 @@ On the monitor hosts:
|
||||
but some free unpartitioned space must be available because the script creates new partitions for journals.
|
||||
- You can change OSD configuration in units or in `vitastor.conf`.
|
||||
Check [Configuration Reference](../config.en.md) for parameter descriptions.
|
||||
- If all your drives have capacitors, or even if they don't but you ran `vitastor-disk`
|
||||
without `--disable_data_fsync off` at the first step, then put the following
|
||||
setting into etcd: \
|
||||
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
||||
- Start all OSDs: `systemctl start vitastor.target`
|
||||
|
||||
## Create a pool
|
||||
@@ -84,10 +88,6 @@ For EC pools the configuration should look like the following:
|
||||
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
|
||||
```
|
||||
|
||||
Add `--immediate_commit none` if you added `--disable_data_fsync off` at the OSD
|
||||
initialization step, or if `vitastor-disk` complained that it was unable to
|
||||
disable drive cache.
|
||||
|
||||
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
||||
|
||||
If you use HDDs you should also add `"block_size": 1048576` to pool configuration.
|
||||
|
@@ -69,6 +69,11 @@
|
||||
для журналов, на SSD должно быть доступно свободное нераспределённое место.
|
||||
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Описания параметров
|
||||
смотрите в [справке по конфигурации](../config.ru.md).
|
||||
- Если все ваши диски - серверные с конденсаторами, и даже если нет, но при этом
|
||||
вы не добавляли опцию `--disable_data_fsync off` на первом шаге, а `vitastor-disk`
|
||||
не ругался на невозможность отключения кэша дисков, пропишите следующую настройку
|
||||
в глобальную конфигурацию в etcd: \
|
||||
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`.
|
||||
- Запустите все OSD: `systemctl start vitastor.target`
|
||||
|
||||
## Создайте пул
|
||||
@@ -85,10 +90,6 @@ vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
|
||||
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
|
||||
```
|
||||
|
||||
Добавьте также опцию `--immediate_commit none`, если вы добавляли `--disable_data_fsync off`
|
||||
на этапе инициализации OSD, либо если `vitastor-disk` ругался на невозможность отключения
|
||||
кэша дисков.
|
||||
|
||||
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
||||
|
||||
Если вы используете HDD-диски, то добавьте в конфигурацию пулов опцию `"block_size": 1048576`.
|
||||
|
@@ -42,7 +42,7 @@ PG state always includes exactly 1 of the following base states:
|
||||
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
|
||||
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
|
||||
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
|
||||
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/pg/config` in etcd.
|
||||
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
|
||||
- **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
|
||||
- **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
|
||||
the PG state.
|
||||
@@ -107,17 +107,16 @@ If a PG is active it can also have any number of the following additional states
|
||||
|
||||
## Removing a healthy disk
|
||||
|
||||
Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
|
||||
move data away. To do that, run `vitastor-cli modify-osd --reweight 0 <OSD_NUMBER>`.
|
||||
|
||||
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
Zero weight can also be put manually into etcd key `/vitastor/config/osd/<OSD_NUMBER>`, for example:
|
||||
Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
|
||||
move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
|
||||
For example:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
|
||||
```
|
||||
|
||||
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
## Removing a failed disk
|
||||
|
||||
If a disk is already dead, its OSD(s) are likely already stopped.
|
||||
@@ -150,7 +149,7 @@ POOL_ID=1
|
||||
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
|
||||
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
|
||||
for i in $(seq 1 $PG_COUNT); do
|
||||
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
done
|
||||
```
|
||||
|
||||
@@ -169,51 +168,21 @@ Upgrading is performed without stopping clients (VMs/containers), you just need
|
||||
upgrade and restart servers one by one. However, ideally you should restart VMs too
|
||||
to make them use the new version of the client library.
|
||||
|
||||
### 1.1.x to 1.2.0
|
||||
Exceptions (specific upgrade instructions):
|
||||
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
|
||||
to be performed with full downtime: first you should stop all clients, then all OSDs,
|
||||
then upgrade and start everything back — because versions before 1.2.0 have several
|
||||
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
|
||||
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
|
||||
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
|
||||
without this intermediate step, client I/O will hang until the end of upgrade process.
|
||||
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
|
||||
|
||||
Upgrading version <= 1.1.x to version >= 1.2.0, if you use EC n+k with k>=2, is recommended
|
||||
to be performed with full downtime: first you should stop all clients, then all OSDs,
|
||||
then upgrade and start everything back — because versions before 1.2.0 have several
|
||||
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
|
||||
|
||||
### 0.8.7 to 0.9.0
|
||||
|
||||
Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
|
||||
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
|
||||
without this intermediate step, client I/O will hang until the end of upgrade process.
|
||||
|
||||
### 0.5.x to 0.6.x
|
||||
|
||||
Upgrading from <= 0.5.x to >= 0.6.x is not supported.
|
||||
|
||||
## Downgrade
|
||||
|
||||
Downgrades are also allowed freely, except the following specific instructions:
|
||||
|
||||
### 1.8.0 to 1.7.1
|
||||
|
||||
Before downgrading from version >= 1.8.0 to version <= 1.7.1
|
||||
you have to copy /vitastor/pg/config etcd key to /vitastor/config/pgs:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
|
||||
etcdctl --endpoints=http://... put /vitastor/config/pgs
|
||||
```
|
||||
|
||||
Then you can just install older packages and restart all services.
|
||||
|
||||
If you performed downgrade without first copying that key, run "add all OSDs into the
|
||||
history records of all PGs" from [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration).
|
||||
|
||||
### 1.0.0 to 0.9.x
|
||||
|
||||
Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 or later can't
|
||||
be rolled back to 0.9.x or previous versions.
|
||||
|
||||
### 0.8.0 to 0.7.x
|
||||
|
||||
Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
|
||||
start with older versions (0.4.x - 0.7.x). :-)
|
||||
Rollback:
|
||||
- Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 can't be rolled
|
||||
back to 0.9.x or previous versions.
|
||||
- Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
|
||||
start with 0.7.x or 0.6.x. :-)
|
||||
|
||||
## OSD memory usage
|
||||
|
||||
|
@@ -42,7 +42,7 @@
|
||||
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
|
||||
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
|
||||
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
|
||||
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/pg/config` в etcd.
|
||||
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
|
||||
- **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
|
||||
- **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
|
||||
- **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
|
||||
@@ -105,16 +105,14 @@ PG должны очень быстро переходить из них в др
|
||||
## Удаление исправного диска
|
||||
|
||||
Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
|
||||
Для этого выполните команду `vitastor-cli modify-osd --reweight 0 <НОМЕР_OSD>`.
|
||||
|
||||
Дождитесь завершения перебалансировки данных, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
Также вес 0 можно прописать вручную прямо в etcd в ключ `/vitastor/config/osd/<НОМЕР_OSD>`, например:
|
||||
Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
|
||||
```
|
||||
|
||||
Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
## Удаление неисправного диска
|
||||
|
||||
Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
|
||||
@@ -147,7 +145,7 @@ POOL_ID=1
|
||||
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
|
||||
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
|
||||
for i in $(seq 1 $PG_COUNT); do
|
||||
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
done
|
||||
```
|
||||
|
||||
@@ -166,51 +164,21 @@ done
|
||||
достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
|
||||
начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.
|
||||
|
||||
### 1.1.x -> 1.2.0
|
||||
Исключения (особые указания при обновлении):
|
||||
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
|
||||
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
|
||||
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
|
||||
могли приводить к некорректному чтению данных в деградированных EC-пулах.
|
||||
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
|
||||
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
|
||||
Иначе клиентский ввод-вывод зависнет до завершения обновления.
|
||||
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
|
||||
|
||||
Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
|
||||
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
|
||||
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
|
||||
могли приводить к некорректному чтению данных в деградированных EC-пулах.
|
||||
|
||||
### 0.8.7 -> 0.9.0
|
||||
|
||||
Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
|
||||
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
|
||||
Иначе клиентский ввод-вывод зависнет до завершения обновления.
|
||||
|
||||
### 0.5.x -> 0.6.x
|
||||
|
||||
Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
|
||||
|
||||
## Откат версии
|
||||
|
||||
Откат (понижение версии) тоже свободно разрешён, кроме указанных ниже случаев:
|
||||
|
||||
### 1.8.0 -> 1.7.1
|
||||
|
||||
Перед понижением версии с >= 1.8.0 до <= 1.7.1 вы должны скопировать ключ
|
||||
etcd `/vitastor/pg/config` в `/vitastor/config/pgs`:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
|
||||
etcdctl --endpoints=http://... put /vitastor/config/pgs
|
||||
```
|
||||
|
||||
После этого можно просто установить более старые пакеты и перезапустить все сервисы.
|
||||
|
||||
Если вы откатили версию, не скопировав предварительно этот ключ - выполните "добавление всех
|
||||
OSD в исторические записи всех PG" из раздела [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов).
|
||||
|
||||
### 1.0.0 -> 0.9.x
|
||||
|
||||
В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
|
||||
нельзя откатить до версии 0.9.x и более ранних.
|
||||
|
||||
### 0.8.0 -> 0.7.x
|
||||
|
||||
В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD не запустятся на
|
||||
более ранних версиях (0.4.x - 0.7.x). :-)
|
||||
Откат:
|
||||
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
|
||||
нельзя откатить до версии 0.9.x и более ранних.
|
||||
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
|
||||
до 0.7.x или 0.6.x. :-)
|
||||
|
||||
## Потребление памяти OSD
|
||||
|
||||
|
@@ -24,10 +24,6 @@ It supports the following commands:
|
||||
- [fix](#fix)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
- [osd-tree](#osd-tree)
|
||||
- [ls-osd](#ls-osd)
|
||||
- [modify-osd](#modify-osd)
|
||||
- [pg-list](#pg-list)
|
||||
- [create-pool](#create-pool)
|
||||
- [modify-pool](#modify-pool)
|
||||
- [ls-pools](#ls-pools)
|
||||
@@ -178,7 +174,6 @@ Remove inode data without changing metadata.
|
||||
--wait-list Retrieve full objects listings before starting to remove objects.
|
||||
Requires more memory, but allows to show correct removal progress.
|
||||
--min-offset Purge only data starting with specified offset.
|
||||
--max-offset Purge only data before specified offset.
|
||||
```
|
||||
|
||||
## merge-data
|
||||
@@ -251,82 +246,6 @@ Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
|
||||
With `--dry-run` only checks if deletion is possible without data loss and
|
||||
redundancy degradation.
|
||||
|
||||
## osd-tree
|
||||
|
||||
`vitastor-cli osd-tree [-l|--long]`
|
||||
|
||||
Show current OSD tree, optionally with I/O statistics if -l is specified.
|
||||
|
||||
Example output:
|
||||
|
||||
```
|
||||
TYPE NAME UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
|
||||
host kaveri
|
||||
disk nvme0n1p1
|
||||
osd 3 down 100G 0 % abc,kaveri 1 128k 4k none -
|
||||
osd 4 down 100G 0 % 1 128k 4k none -
|
||||
disk nvme1n1p1
|
||||
osd 5 down 100G 0 % abc,kaveri 1 128k 4k none -
|
||||
osd 6 down 100G 0 % 1 128k 4k none -
|
||||
host stump
|
||||
osd 1 up 100G 37.29 % osdone 1 128k 4k all -
|
||||
osd 2 up 100G 26.8 % abc 1 128k 4k all -
|
||||
osd 7 up 100G 21.84 % 1 128k 4k all -
|
||||
osd 8 up 100G 21.63 % 1 128k 4k all -
|
||||
osd 9 up 100G 20.69 % 1 128k 4k all -
|
||||
osd 10 up 100G 21.61 % 1 128k 4k all -
|
||||
osd 11 up 100G 21.53 % 1 128k 4k all -
|
||||
osd 12 up 100G 22.4 % 1 128k 4k all -
|
||||
```
|
||||
|
||||
## ls-osd
|
||||
|
||||
`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
|
||||
|
||||
Show current OSDs as list, optionally with I/O statistics if -l is specified.
|
||||
|
||||
Example output:
|
||||
|
||||
```
|
||||
OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
|
||||
3 kaveri/nvme0n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
4 kaveri/nvme0n1p1 down 100G 0 % 1 128k 4k none -
|
||||
5 kaveri/nvme1n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
6 kaveri/nvme1n1p1 down 100G 0 % 1 128k 4k none -
|
||||
1 stump up 100G 37.29 % osdone 1 128k 4k all -
|
||||
2 stump up 100G 26.8 % globl 1 128k 4k all -
|
||||
7 stump up 100G 21.84 % 1 128k 4k all -
|
||||
8 stump up 100G 21.63 % 1 128k 4k all -
|
||||
9 stump up 100G 20.69 % 1 128k 4k all -
|
||||
10 stump up 100G 21.61 % 1 128k 4k all -
|
||||
11 stump up 100G 21.53 % 1 128k 4k all -
|
||||
12 stump up 100G 22.4 % 1 128k 4k all -
|
||||
```
|
||||
|
||||
## modify-osd
|
||||
|
||||
`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
|
||||
|
||||
Set OSD reweight, tags or noout flag. See the detailed description in [OSD config documentation](../config/pool.en.md#osd-settings).
|
||||
|
||||
## pg-list
|
||||
|
||||
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
|
||||
|
||||
List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:
|
||||
|
||||
```
|
||||
--pool <pool name or number> Only list PGs of the given pool.
|
||||
--min <min pg number> Only list PGs with number >= min.
|
||||
--max <max pg number> Only list PGs with number <= max.
|
||||
```
|
||||
|
||||
Examples:
|
||||
|
||||
`vitastor-cli pg-list active+degraded`
|
||||
|
||||
`vitastor-cli pg-list ^active`
|
||||
|
||||
## create-pool
|
||||
|
||||
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
|
||||
|
@@ -23,10 +23,6 @@ vitastor-cli - интерфейс командной строки для адм
|
||||
- [merge-data](#merge-data)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
- [osd-tree](#osd-tree)
|
||||
- [ls-osd](#ls-osd)
|
||||
- [modify-osd](#modify-osd)
|
||||
- [pg-list](#pg-list)
|
||||
- [create-pool](#create-pool)
|
||||
- [modify-pool](#modify-pool)
|
||||
- [ls-pools](#ls-pools)
|
||||
@@ -186,7 +182,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
--wait-list Сначала запросить полный листинг объектов, а потом начать удалять.
|
||||
Требует больше памяти, но позволяет правильно печатать прогресс удаления.
|
||||
--min-offset Удалять только данные, начиная с заданного смещения.
|
||||
--max-offset Удалять только данные до (исключительно) заданного смещения.
|
||||
```
|
||||
|
||||
## merge-data
|
||||
@@ -268,83 +263,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
|
||||
избыточности.
|
||||
|
||||
## osd-tree
|
||||
|
||||
`vitastor-cli osd-tree [-l|--long]`
|
||||
|
||||
Показать дерево OSD, со статистикой ввода-вывода, если установлено -l.
|
||||
|
||||
Пример вывода:
|
||||
|
||||
```
|
||||
TYPE NAME UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
|
||||
host kaveri
|
||||
disk nvme0n1p1
|
||||
osd 3 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
osd 4 down 100G 0 % 1 128k 4k none -
|
||||
disk nvme1n1p1
|
||||
osd 5 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
osd 6 down 100G 0 % 1 128k 4k none -
|
||||
host stump
|
||||
osd 1 up 100G 37.29 % osdone 1 128k 4k all -
|
||||
osd 2 up 100G 26.8 % globl 1 128k 4k all -
|
||||
osd 7 up 100G 21.84 % 1 128k 4k all -
|
||||
osd 8 up 100G 21.63 % 1 128k 4k all -
|
||||
osd 9 up 100G 20.69 % 1 128k 4k all -
|
||||
osd 10 up 100G 21.61 % 1 128k 4k all -
|
||||
osd 11 up 100G 21.53 % 1 128k 4k all -
|
||||
osd 12 up 100G 22.4 % 1 128k 4k all -
|
||||
```
|
||||
|
||||
## ls-osd
|
||||
|
||||
`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
|
||||
|
||||
Показать список OSD, со статистикой ввода-вывода, если установлено -l.
|
||||
|
||||
Пример вывода:
|
||||
|
||||
```
|
||||
OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
|
||||
3 kaveri/nvme0n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
4 kaveri/nvme0n1p1 down 100G 0 % 1 128k 4k none -
|
||||
5 kaveri/nvme1n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
|
||||
6 kaveri/nvme1n1p1 down 100G 0 % 1 128k 4k none -
|
||||
1 stump up 100G 37.29 % osdone 1 128k 4k all -
|
||||
2 stump up 100G 26.8 % globl 1 128k 4k all -
|
||||
7 stump up 100G 21.84 % 1 128k 4k all -
|
||||
8 stump up 100G 21.63 % 1 128k 4k all -
|
||||
9 stump up 100G 20.69 % 1 128k 4k all -
|
||||
10 stump up 100G 21.61 % 1 128k 4k all -
|
||||
11 stump up 100G 21.53 % 1 128k 4k all -
|
||||
12 stump up 100G 22.4 % 1 128k 4k all -
|
||||
```
|
||||
|
||||
## modify-osd
|
||||
|
||||
`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
|
||||
|
||||
Установить вес OSD, теги или флаг noout. Смотрите подробное описание в [документации настроек OSD](../config/pool.ru.md#настройки-osd).
|
||||
|
||||
## pg-list
|
||||
|
||||
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
|
||||
|
||||
Вывести список PG с состояниями, удовлетворяющими любому из переданных фильтров (^ или !
|
||||
в начале фильтра означает отрицание). Опции:
|
||||
|
||||
```
|
||||
--pool <pool name or number> Only list PGs of the given pool.
|
||||
--min <min pg number> Only list PGs with number >= min.
|
||||
--max <max pg number> Only list PGs with number <= max.
|
||||
```
|
||||
|
||||
Примеры:
|
||||
|
||||
`vitastor-cli pg-list active+degraded`
|
||||
|
||||
`vitastor-cli pg-list ^active`
|
||||
|
||||
## create-pool
|
||||
|
||||
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
|
||||
|
@@ -11,8 +11,6 @@ Vitastor has two file system implementations. Both can be used via `vitastor-nfs
|
||||
Commands:
|
||||
- [mount](#mount)
|
||||
- [start](#start)
|
||||
- [upgrade](#upgrade)
|
||||
- [defrag](#defrag)
|
||||
|
||||
## Pseudo-FS
|
||||
|
||||
@@ -88,6 +86,10 @@ POSIX features currently not implemented in VitastorFS:
|
||||
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
|
||||
|
||||
Other notable missing features which should be addressed in the future:
|
||||
- Defragmentation of "shared" inodes. Files smaller than pool object size (block_size
|
||||
multiplied by data part count if pool is EC) are internally stored in large block
|
||||
volumes sequentially, one after another, and leave garbage after deleting or resizing.
|
||||
A defragmenter will be implemented to collect this garbage.
|
||||
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
|
||||
in theory you may hit it if you create and delete a very large number of files
|
||||
- Compaction of the key-value B-Tree. Current implementation never merges or deletes
|
||||
@@ -137,37 +139,6 @@ Start network NFS server. Options:
|
||||
| `--port <PORT>` | use port \<PORT> for NFS services (default is 2049) |
|
||||
| `--portmap 0` | do not listen on port 111 (portmap/rpcbind, requires root) |
|
||||
|
||||
### upgrade
|
||||
|
||||
`vitastor-nfs --fs <NAME> upgrade`
|
||||
|
||||
Upgrade FS metadata. Can be run online, but server(s) should be restarted after upgrade.
|
||||
|
||||
### defrag
|
||||
|
||||
`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
|
||||
|
||||
Defragment volumes used for small file storage having more than \<defrag_percent> %
|
||||
of data removed. Can be run online.
|
||||
|
||||
In VitastorFS, small files are stored in large "volumes" / "shared inodes" one
|
||||
after another. When you delete or extend such files, they are moved and garbage is left
|
||||
behind. Defragmentation removes garbage and moves data still in use to new volumes.
|
||||
|
||||
Options:
|
||||
|
||||
| <!-- --> | <!-- --> |
|
||||
|--------------------------|------------------------------------------------------------------------ |
|
||||
| --volume_untouched 86400 | Defragment volumes last appended to at least this number of seconds ago |
|
||||
| --defrag_percent 50 | Defragment volumes with at least this % of removed data |
|
||||
| --defrag_block_count 16 | Read this number of pool blocks at once during defrag |
|
||||
| --defrag_iodepth 16 | Move up to this number of files in parallel during defrag |
|
||||
| --trace | Print verbose defragmentation status |
|
||||
| --dry-run | Skip modifications, only print status |
|
||||
| --recalc-stats | Recalculate all volume statistics |
|
||||
| --include-empty | Include old and empty volumes; make sure to restart NFS servers before using it |
|
||||
| --no-rm | Move, but do not delete data |
|
||||
|
||||
## Common options
|
||||
|
||||
| <!-- --> | <!-- --> |
|
||||
|
@@ -11,8 +11,6 @@
|
||||
Команды:
|
||||
- [mount](#mount)
|
||||
- [start](#start)
|
||||
- [upgrade](#upgrade)
|
||||
- [defrag](#defrag)
|
||||
|
||||
## Псевдо-ФС
|
||||
|
||||
@@ -90,6 +88,11 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
|
||||
|
||||
Другие недостающие функции, которые нужно добавить в будущем:
|
||||
- Дефрагментация "общих инодов". На уровне реализации ФС файлы, меньшие, чем размер
|
||||
объекта пула (block_size умножить на число частей данных, если пул EC),
|
||||
упаковываются друг за другом в большие "общие" иноды/тома. Если такие файлы удалять
|
||||
или увеличивать, они перемещаются и оставляют за собой "мусор", вот тут-то и нужен
|
||||
дефрагментатор.
|
||||
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
|
||||
увеличиваются, так что в теории вы можете упереться в лимит, если насоздаёте
|
||||
и наудаляете больше, чем 2^48 файлов.
|
||||
@@ -142,40 +145,6 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||
| `--port <PORT>` | использовать порт \<PORT> для NFS-сервисов (по умолчанию 2049) |
|
||||
| `--portmap 0` | отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий) |
|
||||
|
||||
### upgrade
|
||||
|
||||
`vitastor-nfs --fs <NAME> upgrade`
|
||||
|
||||
Обновить метаданные ФС. Можно запускать онлайн (при запущенных серверах NFS), но после выполнения их всё
|
||||
же желательно перезапустить.
|
||||
|
||||
### defrag
|
||||
|
||||
`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
|
||||
|
||||
Дефрагментировать тома, используемые для хранения мелких файлов, в которых более, чем
|
||||
\<defrag_percent> процентов данных удалено. Можно запускать онлайн.
|
||||
|
||||
На уровне реализации ФС файлы, меньшие, чем размер объекта пула (block_size умножить на число
|
||||
частей данных, если пул EC), упаковываются друг за другом в большие "тома" / "общие иноды".
|
||||
Когда такие файлы удаляются или увеличиваются, они перемещаются и оставляют за собой "мусор".
|
||||
|
||||
При дефрагментации мусор удаляется, а всё ещё используемые данные перемещаются в новые тома.
|
||||
|
||||
Опции:
|
||||
|
||||
| <!-- --> | <!-- --> |
|
||||
|--------------------------|------------------------------------------------------------------------ |
|
||||
| --volume_untouched 86400 | Дефрагментировать только тома, в которые уже не писали это число секунд |
|
||||
| --defrag_percent 50 | Дефрагментировать только тома, в которых этот % данных удалён |
|
||||
| --defrag_block_count 16 | Читать это количество блоков пула за один раз |
|
||||
| --defrag_iodepth 16 | Перемещать одновременно до этого числа файлов |
|
||||
| --trace | Печатать детальную статистику дефрагментации |
|
||||
| --dry-run | Не производить никаких изменений, только описать выполняемые действия |
|
||||
| --recalc-stats | Пересчитать и сохранить статистику всех томов |
|
||||
| --include-empty | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции |
|
||||
| --no-rm | Перемещать, но не удалять данные |
|
||||
|
||||
## Общие опции
|
||||
|
||||
| <!-- --> | <!-- --> |
|
||||
|
@@ -1,188 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const AntiEtcd = require('antietcd');
|
||||
|
||||
const vitastor_persist_filter = require('./vitastor_persist_filter.js');
|
||||
const { b64, local_ips } = require('./utils.js');
|
||||
|
||||
/**
 * Adapter that lets the monitor talk to an embedded Antietcd instance through
 * the methods it calls on its etcd backend: parse_config(), start_watcher(),
 * stop_watcher(), become_master() and etcd_call().
 */
class AntiEtcdAdapter
{
    /**
     * Start an embedded Antietcd node when `config.use_antietcd` is set and
     * exactly one entry of `config.etcd_address` points at a local IP
     * (optionally narrowed down by `config.antietcd_port`).
     *
     * Exits the process with code 1 when more than one address matches.
     *
     * @param {object} config - merged monitor configuration
     * @returns {Promise<object|undefined>} the started AntiEtcd instance, or
     *   undefined when Antietcd is disabled or no local address was found
     */
    static async start_antietcd(config)
    {
        if (!config.use_antietcd)
        {
            return undefined;
        }
        // Normalize etcd_address (array or comma-separated string) to a list
        let addresses = config.etcd_address;
        if (!(addresses instanceof Array))
        {
            addresses = addresses ? (''+(addresses||'')).split(/,+/) : [];
        }
        // De-duplicate "host:port" entries after stripping the scheme and the path
        const seen = {};
        addresses.forEach((url) =>
        {
            const host_port = url.toLowerCase().replace(/^(https?:\/\/)/, '').replace(/\/.*$/, '');
            seen[host_port] = true;
        });
        const cluster = Object.keys(seen);
        const cfg_port = config.antietcd_port;
        const is_local = {};
        local_ips(true).forEach((addr) =>
        {
            is_local[addr] = true;
        });
        // Keep only [ host, port ] pairs which refer to this machine
        const selected = [];
        for (const host_port of cluster)
        {
            const parts = host_port.split(':', 2);
            if (is_local[parts[0]] && (!cfg_port || parts[1] == cfg_port))
            {
                selected.push(parts);
            }
        }
        if (selected.length > 1)
        {
            console.error('More than 1 etcd_address matches local IPs, please specify port');
            process.exit(1);
            return undefined;
        }
        if (selected.length != 1)
        {
            console.log('Antietcd is enabled, but etcd_address does not contain local IPs, proceeding without it');
            return undefined;
        }
        const [ ip, port ] = selected[0];
        const default_data = (config.antietcd_data_dir || '/var/lib/vitastor') + '/mon_'+port+'.json.gz';
        // A single-address cluster is passed as null, otherwise as { "host:port": "http://host:port" }
        let peers = null;
        if (cluster.length != 1)
        {
            peers = {};
            for (const host_port of cluster)
            {
                peers[host_port] = "http://"+host_port;
            }
        }
        const antietcd_config = {
            ip,
            port,
            data: config.antietcd_data_file || default_data,
            persist_filter: vitastor_persist_filter({ vitastor_prefix: config.etcd_prefix || '/vitastor' }),
            node_id: ip+':'+port, // node_id = ip:port
            cluster: peers,
            cluster_key: (config.etcd_prefix || '/vitastor'),
            stale_read: 1,
            log_level: 1,
        };
        // Forward remaining antietcd_* options; of the values computed above
        // only `ip` and `cluster_key` may be overridden
        for (const key in config)
        {
            if (!key.startsWith('antietcd_'))
            {
                continue;
            }
            const option = key.slice(9);
            const overridable = option == 'ip' || option == 'cluster_key';
            if (overridable || !(option in antietcd_config))
            {
                antietcd_config[option] = config[key];
            }
        }
        console.log('Starting Antietcd node '+antietcd_config.node_id);
        const antietcd = new AntiEtcd(antietcd_config);
        await antietcd.start();
        return antietcd;
    }

    /**
     * @param {object} mon - monitor instance owning this adapter
     * @param {object} antietcd - started AntiEtcd instance
     */
    constructor(mon, antietcd)
    {
        this.mon = mon;
        this.antietcd = antietcd;
        // Callbacks to run once on the next 'raftchange' event reporting the 'leader' state
        this.on_leader = [];
        this.on_change = (st) =>
        {
            if (st.state !== 'leader')
            {
                return;
            }
            for (const notify of this.on_leader)
            {
                notify();
            }
            this.on_leader = [];
        };
        this.antietcd.on('raftchange', this.on_change);
    }

    /** Nothing to parse for the embedded backend. */
    parse_config(/*config*/)
    {
    }

    /**
     * Unsubscribe from 'raftchange' events and cancel the active watch, if any.
     * Cancellation is not awaited; its errors are printed with console.error.
     */
    stop_watcher()
    {
        this.antietcd.off('raftchange', this.on_change);
        const active_watch = this.watch_id;
        if (!active_watch)
        {
            return;
        }
        this.watch_id = null;
        this.antietcd.cancel_watch(active_watch).catch(console.error);
    }

    /**
     * (Re)create the watch over the key range from etcd_prefix+'/' up to
     * etcd_prefix+'0' ('0' is the character right after '/'), starting at the
     * monitor's current watch revision. Each message's `result` is handed to
     * mon.on_message() via setImmediate.
     */
    async start_watcher()
    {
        if (this.watch_id)
        {
            await this.antietcd.cancel_watch(this.watch_id);
            this.watch_id = null;
        }
        const prefix = this.mon.config.etcd_prefix;
        const request = {
            key: b64(prefix+'/'),
            range_end: b64(prefix+'0'),
            start_revision: ''+this.mon.etcd_watch_revision,
            watch_id: 1,
            progress_notify: true,
        };
        const deliver = (message) =>
        {
            setImmediate(() => this.mon.on_message(message.result));
        };
        const new_watch = await this.antietcd.create_watch(request, deliver);
        console.log('Successfully subscribed to antietcd revision '+this.antietcd.etctree.mod_revision);
        this.watch_id = new_watch;
    }

    /**
     * In clustered mode, wait until the raft state is 'leader'; then write
     * the monitor state to <etcd_prefix>/mon/master under the monitor's lease.
     */
    async become_master()
    {
        if (this.antietcd.cluster)
        {
            console.log('Waiting to become master');
            if (this.antietcd.cluster.raft.state !== 'leader')
            {
                // Resolved by this.on_change on the next 'leader' raftchange event
                await new Promise(resolve => this.on_leader.push(resolve));
            }
        }
        else
        {
            console.log('Running in non-clustered mode');
        }
        const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
        const put_master = {
            requestPut: {
                key: b64(this.mon.config.etcd_prefix+'/mon/master'),
                value: b64(JSON.stringify(state)),
                lease: ''+this.mon.etcd_lease_id,
            },
        };
        await this.etcd_call('/kv/txn', { success: [ put_master ] }, this.mon.config.etcd_start_timeout, 0);
        if (this.antietcd.cluster)
        {
            console.log('Became master');
        }
    }

    /**
     * Call an Antietcd API method, retrying on failure.
     *
     * The method name is derived from `path` by trimming leading and trailing
     * slashes and replacing the remaining slash runs with '_' ('/kv/txn' -> 'kv_txn').
     *
     * @param {string} path - etcd-style API path
     * @param {object} body - request body passed to antietcd.api()
     * @param {number} timeout - minimal delay in ms between the starts of two attempts
     * @param {number} retries - attempt limit; negative means unlimited,
     *   values in [0, 1) are treated as 1
     * @returns {Promise<object>} the first response without an `error` field
     * @throws {Error} when the monitor is stopped or all attempts have failed
     */
    async etcd_call(path, body, timeout, retries)
    {
        if (retries >= 0 && retries < 1)
        {
            retries = 1;
        }
        let last_attempt = 0;
        for (let done = 0; retries < 0 || done < retries; done++)
        {
            const attempt = done+1;
            if (this.mon.stopped)
            {
                throw new Error('Monitor instance is stopped');
            }
            try
            {
                if (Date.now()-last_attempt < timeout)
                {
                    await new Promise(wake => setTimeout(wake, timeout-(Date.now()-last_attempt)));
                }
                last_attempt = Date.now();
                const method = path.replace(/^\/+/, '').replace(/\/+$/, '').replace(/\/+/g, '_');
                const res = await this.antietcd.api(method, body);
                if (!res.error)
                {
                    return res;
                }
                console.error('Failed to query antietcd '+path+' (retry '+attempt+'/'+retries+'): '+res.error);
            }
            catch (e)
            {
                console.error('Failed to query antietcd '+path+' (retry '+attempt+'/'+retries+'): '+e.stack);
            }
        }
        throw new Error('Failed to query antietcd ('+retries+' retries)');
    }
}
|
||||
|
||||
module.exports = AntiEtcdAdapter;
|
@@ -3,7 +3,6 @@
|
||||
|
||||
const http = require('http');
|
||||
const WebSocket = require('ws');
|
||||
const { b64, local_ips } = require('./utils.js');
|
||||
|
||||
const MON_STOPPED = 'Monitor instance is stopped';
|
||||
|
||||
@@ -24,7 +23,7 @@ class EtcdAdapter
|
||||
|
||||
parse_etcd_addresses(addrs)
|
||||
{
|
||||
const is_local_ip = local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
|
||||
const is_local_ip = this.mon.local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
|
||||
this.etcd_local = [];
|
||||
this.etcd_urls = [];
|
||||
this.selected_etcd_url = null;
|
||||
@@ -349,4 +348,9 @@ function POST(url, body, timeout)
|
||||
});
|
||||
}
|
||||
|
||||
// Return the base64 text of `str` (anything Buffer.from() accepts)
function b64(str)
{
    const raw = Buffer.from(str);
    return raw.toString('base64');
}
|
||||
|
||||
module.exports = EtcdAdapter;
|
||||
|
@@ -6,7 +6,7 @@ const etcd_nonempty_keys = {
|
||||
'config/global': 1,
|
||||
'config/node_placement': 1,
|
||||
'config/pools': 1,
|
||||
'pg/config': 1,
|
||||
'config/pgs': 1,
|
||||
'history/last_clean_pgs': 1,
|
||||
'stats': 1,
|
||||
};
|
||||
@@ -15,8 +15,7 @@ const etcd_allow = new RegExp('^'+[
|
||||
'config/node_placement',
|
||||
'config/pools',
|
||||
'config/osd/[1-9]\\d*',
|
||||
'config/pgs', // old name
|
||||
'pg/config',
|
||||
'config/pgs',
|
||||
'config/inode/[1-9]\\d*/[1-9]\\d*',
|
||||
'osd/state/[1-9]\\d*',
|
||||
'osd/stats/[1-9]\\d*',
|
||||
@@ -25,8 +24,7 @@ const etcd_allow = new RegExp('^'+[
|
||||
'mon/master',
|
||||
'mon/member/[a-f0-9]+',
|
||||
'pg/state/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/stats/[1-9]\\d*/[1-9]\\d*', // old name
|
||||
'pgstats/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/stats/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
||||
'history/last_clean_pgs',
|
||||
'inode/stats/[1-9]\\d*/\\d+',
|
||||
@@ -71,7 +69,7 @@ const etcd_tree = {
|
||||
block_size: 131072,
|
||||
disk_alignment: 4096,
|
||||
bitmap_granularity: 4096,
|
||||
immediate_commit: 'all', // 'none', 'all' or 'small'
|
||||
immediate_commit: false, // 'all' or 'small'
|
||||
// client - configurable online
|
||||
client_max_dirty_bytes: 33554432,
|
||||
client_max_dirty_ops: 1024,
|
||||
@@ -191,7 +189,7 @@ const etcd_tree = {
|
||||
block_size: 131072,
|
||||
bitmap_granularity: 4096,
|
||||
// 'all'/'small'/'none', same as in OSD options
|
||||
immediate_commit: 'all',
|
||||
immediate_commit: 'none',
|
||||
pg_stripe_size: 0,
|
||||
root_node?: 'rack1',
|
||||
// restrict pool to OSDs having all of these tags
|
||||
@@ -207,6 +205,19 @@ const etcd_tree = {
|
||||
osd: {
|
||||
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
|
||||
},
|
||||
/* pgs: {
|
||||
hash: string,
|
||||
items: {
|
||||
<pool_id>: {
|
||||
<pg_id>: {
|
||||
osd_set: [ 1, 2, 3 ],
|
||||
primary: 1,
|
||||
pause: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
}, */
|
||||
pgs: {},
|
||||
/* inode: {
|
||||
<pool_id>: {
|
||||
<inode_t>: {
|
||||
@@ -234,9 +245,6 @@ const etcd_tree = {
|
||||
stats: {
|
||||
/* <osd_num_t>: {
|
||||
time: number, // unix time
|
||||
data_block_size: uint64_t, // bytes
|
||||
bitmap_granularity: uint64_t, // bytes
|
||||
immediate_commit: "all"|"small"|"none",
|
||||
blockstore_ready: boolean,
|
||||
size: uint64_t, // bytes
|
||||
free: uint64_t, // bytes
|
||||
@@ -274,24 +282,11 @@ const etcd_tree = {
|
||||
master: {
|
||||
/* ip: [ string ], id: uint64_t */
|
||||
},
|
||||
member: {
|
||||
standby: {
|
||||
/* <uint64_t>: { ip: [ string ] }, */
|
||||
},
|
||||
},
|
||||
pg: {
|
||||
/* config: {
|
||||
hash: string,
|
||||
items: {
|
||||
<pool_id>: {
|
||||
<pg_id>: {
|
||||
osd_set: [ 1, 2, 3 ],
|
||||
primary: 1,
|
||||
pause: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
}, */
|
||||
config: {},
|
||||
state: {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
@@ -302,6 +297,18 @@ const etcd_tree = {
|
||||
}
|
||||
}, */
|
||||
},
|
||||
stats: {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
object_count: uint64_t,
|
||||
clean_count: uint64_t,
|
||||
misplaced_count: uint64_t,
|
||||
degraded_count: uint64_t,
|
||||
incomplete_count: uint64_t,
|
||||
write_osd_set: osd_num_t[],
|
||||
},
|
||||
}, */
|
||||
},
|
||||
history: {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
@@ -313,18 +320,6 @@ const etcd_tree = {
|
||||
}, */
|
||||
},
|
||||
},
|
||||
pgstats: {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
object_count: uint64_t,
|
||||
clean_count: uint64_t,
|
||||
misplaced_count: uint64_t,
|
||||
degraded_count: uint64_t,
|
||||
incomplete_count: uint64_t,
|
||||
write_osd_set: osd_num_t[],
|
||||
},
|
||||
}, */
|
||||
},
|
||||
inode: {
|
||||
stats: {
|
||||
/* <pool_id>: {
|
||||
|
@@ -1,50 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const fsp = require('fs').promises;
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
|
||||
/**
 * Create the monitor's HTTP or HTTPS server and start listening.
 *
 * Configuration keys read from `cfg`:
 * - mon_https_cert, mon_https_key: certificate and key file paths; HTTPS is
 *   used when mon_https_cert is set, plain HTTP otherwise
 * - mon_https_ca: optional CA certificate file path
 * - mon_https_client_auth: when truthy, request a certificate from clients
 * - mon_http_port: listen port (default 8060)
 * - mon_http_ip: listen address (omitted from listen() when empty)
 *
 * @param {object} cfg - monitor configuration
 * @param {function} handler - request listener passed to createServer()
 * @returns {Promise<object|null>} the server, or null when listen() fails
 *   synchronously. Asynchronous listen failures are only logged, because
 *   they arrive after this function has already returned the server.
 */
async function create_http_server(cfg, handler)
{
    const port = cfg.mon_http_port || 8060;
    const addr = (cfg.mon_http_ip || '')+':'+port;
    let server;
    if (cfg.mon_https_cert)
    {
        const tls = {
            key: await fsp.readFile(cfg.mon_https_key),
            cert: await fsp.readFile(cfg.mon_https_cert),
        };
        if (cfg.mon_https_ca)
        {
            // The TLS option is named `ca`; any other key is ignored by Node
            tls.ca = await fsp.readFile(cfg.mon_https_ca);
        }
        if (cfg.mon_https_client_auth)
        {
            tls.requestCert = true;
        }
        server = https.createServer(tls, handler);
    }
    else
    {
        server = http.createServer(handler);
    }
    try
    {
        let err, listen_returned = false;
        server.once('error', e =>
        {
            err = e;
            // Node reports most listen failures (e.g. EADDRINUSE) through an
            // asynchronous 'error' event, so print them instead of dropping them
            if (listen_returned)
            {
                console.error('HTTP server error at address: '+addr+': '+e);
            }
        });
        server.listen(port, cfg.mon_http_ip || undefined);
        listen_returned = true;
        if (err)
            throw err;
    }
    catch (e)
    {
        console.error(
            'HTTP server disabled because listen at address: '+
            addr+' failed with error: '+e
        );
        return null;
    }
    return server;
}
|
||||
|
||||
module.exports = { create_http_server };
|
@@ -23,4 +23,4 @@ for (let i = 2; i < process.argv.length; i++)
|
||||
}
|
||||
}
|
||||
|
||||
Mon.run_forever(options).catch(console.error);
|
||||
Mon.run_forever(options);
|
||||
|
252
mon/mon.js
252
mon/mon.js
@@ -1,43 +1,27 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { URL } = require('url');
|
||||
const fs = require('fs');
|
||||
const crypto = require('crypto');
|
||||
const os = require('os');
|
||||
const AntiEtcdAdapter = require('./antietcd_adapter.js');
|
||||
const EtcdAdapter = require('./etcd_adapter.js');
|
||||
const { create_http_server } = require('./http_server.js');
|
||||
const { export_prometheus_metrics } = require('./prometheus.js');
|
||||
const { etcd_tree, etcd_allow, etcd_nonempty_keys } = require('./etcd_schema.js');
|
||||
const { validate_pool_cfg } = require('./pool_config.js');
|
||||
const { sum_op_stats, sum_object_counts, sum_inode_stats, serialize_bigints } = require('./stats.js');
|
||||
const stableStringify = require('./stable-stringify.js');
|
||||
const { scale_pg_history } = require('./pg_utils.js');
|
||||
const { get_osd_tree } = require('./osd_tree.js');
|
||||
const { b64, de64, local_ips } = require('./utils.js');
|
||||
const { recheck_primary, save_new_pgs_txn, generate_pool_pgs } = require('./pg_gen.js');
|
||||
|
||||
class Mon
|
||||
{
|
||||
static async run_forever(config)
|
||||
static run_forever(config)
|
||||
{
|
||||
let mergedConfig = config;
|
||||
if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf'))
|
||||
{
|
||||
const fileConfig = JSON.parse(fs.readFileSync(config.config_path||'/etc/vitastor/vitastor.conf', { encoding: 'utf-8' }));
|
||||
mergedConfig = { ...fileConfig, ...config };
|
||||
}
|
||||
let antietcd = await AntiEtcdAdapter.start_antietcd(mergedConfig);
|
||||
let mon;
|
||||
const run = () =>
|
||||
{
|
||||
console.log('Starting Monitor');
|
||||
const my_mon = new Mon(config);
|
||||
my_mon.etcd = antietcd
|
||||
? new AntiEtcdAdapter(my_mon, antietcd)
|
||||
: new EtcdAdapter(my_mon);
|
||||
my_mon.etcd.parse_config(my_mon.config);
|
||||
mon = my_mon;
|
||||
my_mon.on_die = () =>
|
||||
{
|
||||
@@ -74,57 +58,24 @@ class Mon
|
||||
this.state = JSON.parse(JSON.stringify(etcd_tree));
|
||||
this.prev_stats = { osd_stats: {}, osd_diff: {} };
|
||||
this.recheck_pgs_active = false;
|
||||
this.watcher_active = false;
|
||||
this.old_pg_config = false;
|
||||
this.old_pg_stats_seen = false;
|
||||
this.etcd = new EtcdAdapter(this);
|
||||
this.etcd.parse_config(this.config);
|
||||
}
|
||||
|
||||
async start()
|
||||
{
|
||||
if (this.config.enable_prometheus || !('enable_prometheus' in this.config))
|
||||
{
|
||||
this.http = await create_http_server(this.config, (req, res) =>
|
||||
{
|
||||
const u = new URL(req.url, 'http://'+(req.headers.host || 'localhost'));
|
||||
if (u.pathname.replace(/\/+$/, '') == (this.config.prometheus_path||'/metrics'))
|
||||
{
|
||||
if (!this.watcher_active)
|
||||
{
|
||||
res.writeHead(503);
|
||||
res.write('Monitor is in standby mode. Please retrieve metrics from master monitor instance\n');
|
||||
}
|
||||
else
|
||||
{
|
||||
res.write(export_prometheus_metrics(this.state));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
res.writeHead(404);
|
||||
res.write('Not found. Metrics path: '+(this.config.prometheus_path||'/metrics\n'));
|
||||
}
|
||||
res.end();
|
||||
});
|
||||
this.http_connections = new Set();
|
||||
this.http.on('connection', conn =>
|
||||
{
|
||||
this.http_connections.add(conn);
|
||||
conn.once('close', () => this.http_connections.delete(conn));
|
||||
});
|
||||
}
|
||||
await this.load_config();
|
||||
await this.get_lease();
|
||||
await this.etcd.become_master();
|
||||
await this.load_cluster_state();
|
||||
await this.etcd.start_watcher(this.config.etcd_mon_retries);
|
||||
this.watcher_active = true;
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
if (!this.state.pool.stats[pool_id] ||
|
||||
!Number(this.state.pool.stats[pool_id].pg_real_size))
|
||||
{
|
||||
// Generate missing data in etcd
|
||||
this.state.pg.config.hash = null;
|
||||
this.state.config.pgs.hash = null;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -196,22 +147,17 @@ class Mon
|
||||
this.etcd_watch_revision = BigInt(msg.header.revision)+BigInt(1);
|
||||
for (const e of msg.events||[])
|
||||
{
|
||||
const kv = this.parse_kv(e.kv);
|
||||
const key = kv.key.substr(this.config.etcd_prefix.length);
|
||||
this.parse_kv(e.kv);
|
||||
const key = e.kv.key.substr(this.config.etcd_prefix.length);
|
||||
if (key.substr(0, 11) == '/osd/state/')
|
||||
{
|
||||
stats_changed = true;
|
||||
changed = true;
|
||||
}
|
||||
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 9) == '/pgstats/' || key.substr(0, 16) == '/osd/inodestats/')
|
||||
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
|
||||
{
|
||||
stats_changed = true;
|
||||
}
|
||||
else if (key.substr(0, 10) == '/pg/stats/')
|
||||
{
|
||||
this.old_pg_stats_seen = true;
|
||||
stats_changed = true;
|
||||
}
|
||||
else if (key.substr(0, 10) == '/pg/state/')
|
||||
{
|
||||
pg_states_changed = true;
|
||||
@@ -222,7 +168,7 @@ class Mon
|
||||
}
|
||||
if (this.config.verbose)
|
||||
{
|
||||
console.log(JSON.stringify({ ...e, kv: kv || undefined }));
|
||||
console.log(JSON.stringify(e));
|
||||
}
|
||||
}
|
||||
if (pg_states_changed)
|
||||
@@ -292,7 +238,7 @@ class Mon
|
||||
continue next_pool;
|
||||
}
|
||||
}
|
||||
new_clean_pgs.items[pool_id] = this.state.pg.config.items[pool_id];
|
||||
new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
|
||||
}
|
||||
this.state.history.last_clean_pgs = new_clean_pgs;
|
||||
await this.etcd.etcd_call('/kv/txn', {
|
||||
@@ -306,7 +252,7 @@ class Mon
|
||||
|
||||
get_mon_state()
|
||||
{
|
||||
return { ip: local_ips(), hostname: os.hostname() };
|
||||
return { ip: this.local_ips(), hostname: os.hostname() };
|
||||
}
|
||||
|
||||
async get_lease()
|
||||
@@ -338,16 +284,6 @@ class Mon
|
||||
async on_stop()
|
||||
{
|
||||
console.log('Stopping Monitor');
|
||||
if (this.http)
|
||||
{
|
||||
await new Promise(ok =>
|
||||
{
|
||||
this.http.close(ok);
|
||||
for (const conn of this.http_connections)
|
||||
conn.destroy();
|
||||
});
|
||||
this.http = null;
|
||||
}
|
||||
this.etcd.stop_watcher();
|
||||
if (this.save_last_clean_timer)
|
||||
{
|
||||
@@ -403,50 +339,6 @@ class Mon
|
||||
this.parse_kv(kv);
|
||||
}
|
||||
}
|
||||
if (Object.keys((this.state.config.pgs||{}).items||{}).length)
|
||||
{
|
||||
// Support seamless upgrade to new OSDs
|
||||
if (!Object.keys((this.state.pg.config||{}).items||{}).length)
|
||||
{
|
||||
const pgs = JSON.stringify(this.state.config.pgs);
|
||||
this.state.pg.config = JSON.parse(pgs);
|
||||
const res = await this.etcd.etcd_call('/kv/txn', {
|
||||
success: [
|
||||
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(pgs) } },
|
||||
],
|
||||
compare: [
|
||||
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
],
|
||||
}, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
|
||||
if (!res.succeeded)
|
||||
throw new Error('Failed to duplicate old PG config to new PG config');
|
||||
}
|
||||
this.old_pg_config = true;
|
||||
this.old_pg_config_timer = setInterval(() => this.check_clear_old_config().catch(console.error),
|
||||
this.config.old_pg_config_clear_interval||3600000);
|
||||
}
|
||||
}
|
||||
|
||||
async check_clear_old_config()
|
||||
{
|
||||
if (this.old_pg_config && this.old_pg_stats_seen)
|
||||
{
|
||||
this.old_pg_stats_seen = false;
|
||||
return;
|
||||
}
|
||||
if (this.old_pg_config)
|
||||
{
|
||||
await this.etcd.etcd_call('/kv/txn', { success: [
|
||||
{ requestDeleteRange: { key: b64(this.config.etcd_prefix+'/config/pgs') } },
|
||||
{ requestDeleteRange: { key: b64(this.config.etcd_prefix+'/pg/stats/'), range_end: b64(this.config.etcd_prefix+'/pg/stats0') } },
|
||||
] }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
|
||||
this.old_pg_config = false;
|
||||
}
|
||||
if (this.old_pg_config_timer)
|
||||
{
|
||||
clearInterval(this.old_pg_config_timer);
|
||||
this.old_pg_config_timer = null;
|
||||
}
|
||||
}
|
||||
|
||||
all_osds()
|
||||
@@ -457,7 +349,7 @@ class Mon
|
||||
async stop_all_pgs(pool_id)
|
||||
{
|
||||
let has_online = false, paused = true;
|
||||
for (const pg in this.state.pg.config.items[pool_id]||{})
|
||||
for (const pg in this.state.config.pgs.items[pool_id]||{})
|
||||
{
|
||||
// FIXME: Change all (||{}) to ?. (optional chaining) at some point
|
||||
const cur_state = (((this.state.pg.state[pool_id]||{})[pg]||{}).state||[]).join(',');
|
||||
@@ -465,7 +357,7 @@ class Mon
|
||||
{
|
||||
has_online = true;
|
||||
}
|
||||
if (!this.state.pg.config.items[pool_id][pg].pause)
|
||||
if (!this.state.config.pgs.items[pool_id][pg].pause)
|
||||
{
|
||||
paused = false;
|
||||
}
|
||||
@@ -473,7 +365,7 @@ class Mon
|
||||
if (!paused)
|
||||
{
|
||||
console.log('Stopping all PGs for pool '+pool_id+' before changing PG count');
|
||||
const new_cfg = JSON.parse(JSON.stringify(this.state.pg.config));
|
||||
const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
for (const pg in new_cfg.items[pool_id])
|
||||
{
|
||||
new_cfg.items[pool_id][pg].pause = true;
|
||||
@@ -481,26 +373,22 @@ class Mon
|
||||
// Check that no OSDs change their state before we pause PGs
|
||||
// Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
|
||||
// and can't see the old PG configuration
|
||||
const checks = [
|
||||
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
|
||||
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
];
|
||||
const checks = [];
|
||||
for (const osd_num of this.all_osds())
|
||||
{
|
||||
const key = b64(this.config.etcd_prefix+'/osd/state/'+osd_num);
|
||||
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
|
||||
}
|
||||
const txn = {
|
||||
compare: checks,
|
||||
success: [
|
||||
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(JSON.stringify(new_cfg)) } },
|
||||
await this.etcd.etcd_call('/kv/txn', {
|
||||
compare: [
|
||||
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
|
||||
{ key: b64(this.config.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
...checks,
|
||||
],
|
||||
};
|
||||
if (this.old_pg_config)
|
||||
{
|
||||
txn.success.push({ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } });
|
||||
}
|
||||
await this.etcd.etcd_call('/kv/txn', txn, this.config.etcd_mon_timeout, 0);
|
||||
success: [
|
||||
{ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
|
||||
],
|
||||
}, this.config.etcd_mon_timeout, 0);
|
||||
return false;
|
||||
}
|
||||
return !has_online;
|
||||
@@ -528,7 +416,7 @@ class Mon
|
||||
pools: this.state.config.pools,
|
||||
};
|
||||
const tree_hash = sha1hex(stableStringify(tree_cfg));
|
||||
if (this.state.pg.config.hash != tree_hash)
|
||||
if (this.state.config.pgs.hash != tree_hash)
|
||||
{
|
||||
// Something has changed
|
||||
console.log('Pool configuration or OSD tree changed, re-optimizing');
|
||||
@@ -569,10 +457,10 @@ class Mon
|
||||
else
|
||||
{
|
||||
// Nothing changed, but we still want to recheck the distribution of primaries
|
||||
let new_pg_config = recheck_primary(this.state, this.config, up_osds, osd_tree);
|
||||
if (new_pg_config)
|
||||
let new_config_pgs = recheck_primary(this.state, this.config, up_osds, osd_tree);
|
||||
if (new_config_pgs)
|
||||
{
|
||||
const ok = await this.save_pg_config(new_pg_config);
|
||||
const ok = await this.save_pg_config(new_config_pgs);
|
||||
if (ok)
|
||||
console.log('PG configuration successfully changed');
|
||||
else
|
||||
@@ -587,12 +475,12 @@ class Mon
|
||||
|
||||
async apply_pool_pgs(results, up_osds, osd_tree, tree_hash)
|
||||
{
|
||||
for (const pool_id in (this.state.pg.config||{}).items||{})
|
||||
for (const pool_id in (this.state.config.pgs||{}).items||{})
|
||||
{
|
||||
// We should stop all PGs when deleting a pool or changing its PG count
|
||||
if (!this.state.config.pools[pool_id] ||
|
||||
this.state.pg.config.items[pool_id] && this.state.config.pools[pool_id].pg_count !=
|
||||
Object.keys(this.state.pg.config.items[pool_id]).reduce((a, c) => (a < (0|c) ? (0|c) : a), 0))
|
||||
this.state.config.pgs.items[pool_id] && this.state.config.pools[pool_id].pg_count !=
|
||||
Object.keys(this.state.config.pgs.items[pool_id]).reduce((a, c) => (a < (0|c) ? (0|c) : a), 0))
|
||||
{
|
||||
if (!await this.stop_all_pgs(pool_id))
|
||||
{
|
||||
@@ -600,22 +488,22 @@ class Mon
|
||||
}
|
||||
}
|
||||
}
|
||||
const new_pg_config = JSON.parse(JSON.stringify(this.state.pg.config));
|
||||
const new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
const etcd_request = { compare: [], success: [] };
|
||||
for (const pool_id in (new_pg_config||{}).items||{})
|
||||
for (const pool_id in (new_config_pgs||{}).items||{})
|
||||
{
|
||||
if (!this.state.config.pools[pool_id])
|
||||
{
|
||||
const prev_pgs = [];
|
||||
for (const pg in new_pg_config.items[pool_id]||{})
|
||||
for (const pg in new_config_pgs.items[pool_id]||{})
|
||||
{
|
||||
prev_pgs[pg-1] = new_pg_config.items[pool_id][pg].osd_set;
|
||||
prev_pgs[pg-1] = new_config_pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
// Also delete pool statistics
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
} });
|
||||
save_new_pgs_txn(new_pg_config, etcd_request, this.state, this.config.etcd_prefix,
|
||||
save_new_pgs_txn(new_config_pgs, etcd_request, this.state, this.config.etcd_prefix,
|
||||
this.etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, [], []);
|
||||
}
|
||||
}
|
||||
@@ -624,7 +512,7 @@ class Mon
|
||||
const pool_id = pool_res.pool_id;
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
let pg_history = [];
|
||||
for (const pg in ((this.state.pg.config.items||{})[pool_id]||{}))
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
if (this.state.pg.history[pool_id] &&
|
||||
this.state.pg.history[pool_id][pg])
|
||||
@@ -633,9 +521,9 @@ class Mon
|
||||
}
|
||||
}
|
||||
const real_prev_pgs = [];
|
||||
for (const pg in ((this.state.pg.config.items||{})[pool_id]||{}))
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
real_prev_pgs[pg-1] = [ ...this.state.pg.config.items[pool_id][pg].osd_set ];
|
||||
real_prev_pgs[pg-1] = [ ...this.state.config.pgs.items[pool_id][pg].osd_set ];
|
||||
}
|
||||
if (real_prev_pgs.length > 0 && real_prev_pgs.length != pool_res.pgs.length)
|
||||
{
|
||||
@@ -646,8 +534,8 @@ class Mon
|
||||
pg_history = scale_pg_history(pg_history, real_prev_pgs, pool_res.pgs);
|
||||
// Drop stats
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'/'),
|
||||
range_end: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'0'),
|
||||
key: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'/'),
|
||||
range_end: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'0'),
|
||||
} });
|
||||
}
|
||||
const stats = {
|
||||
@@ -658,26 +546,22 @@ class Mon
|
||||
key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(stats)),
|
||||
} });
|
||||
save_new_pgs_txn(new_pg_config, etcd_request, this.state, this.config.etcd_prefix,
|
||||
save_new_pgs_txn(new_config_pgs, etcd_request, this.state, this.config.etcd_prefix,
|
||||
this.etcd_watch_revision, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
|
||||
}
|
||||
new_pg_config.hash = tree_hash;
|
||||
return await this.save_pg_config(new_pg_config, etcd_request);
|
||||
new_config_pgs.hash = tree_hash;
|
||||
return await this.save_pg_config(new_config_pgs, etcd_request);
|
||||
}
|
||||
|
||||
async save_pg_config(new_pg_config, etcd_request = { compare: [], success: [] })
|
||||
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
|
||||
{
|
||||
etcd_request.compare.push(
|
||||
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
|
||||
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
{ key: b64(this.config.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
);
|
||||
etcd_request.success.push(
|
||||
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(JSON.stringify(new_pg_config)) } },
|
||||
{ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_config_pgs)) } },
|
||||
);
|
||||
if (this.old_pg_config)
|
||||
{
|
||||
etcd_request.success.push({ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_pg_config)) } });
|
||||
}
|
||||
const txn_res = await this.etcd.etcd_call('/kv/txn', etcd_request, this.config.etcd_mon_timeout, 0);
|
||||
return txn_res.succeeded;
|
||||
}
|
||||
@@ -806,16 +690,15 @@ class Mon
|
||||
{
|
||||
if (!kv || !kv.key)
|
||||
{
|
||||
return kv;
|
||||
return;
|
||||
}
|
||||
kv = { ...kv };
|
||||
kv.key = de64(kv.key);
|
||||
kv.value = kv.value ? de64(kv.value) : null;
|
||||
let key = kv.key.substr(this.config.etcd_prefix.length+1);
|
||||
if (!etcd_allow.exec(key))
|
||||
{
|
||||
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
|
||||
return kv;
|
||||
return;
|
||||
}
|
||||
try
|
||||
{
|
||||
@@ -824,7 +707,7 @@ class Mon
|
||||
catch (e)
|
||||
{
|
||||
console.log('Bad value in etcd: '+kv.key+' = '+kv.value);
|
||||
return kv;
|
||||
return;
|
||||
}
|
||||
let key_parts = key.split('/');
|
||||
let cur = this.state;
|
||||
@@ -838,14 +721,7 @@ class Mon
|
||||
kv.value = kv.value || {};
|
||||
}
|
||||
const old = cur[key_parts[key_parts.length-1]];
|
||||
if (kv.value == null)
|
||||
{
|
||||
delete cur[key_parts[key_parts.length-1]];
|
||||
}
|
||||
else
|
||||
{
|
||||
cur[key_parts[key_parts.length-1]] = kv.value;
|
||||
}
|
||||
cur[key_parts[key_parts.length-1]] = kv.value;
|
||||
if (key === 'config/global')
|
||||
{
|
||||
this.config = { ...this.fileConfig, ...this.state.config.global, ...this.cliConfig };
|
||||
@@ -881,7 +757,6 @@ class Mon
|
||||
!this.state.osd.stats[osd_num] ? 0 : this.state.osd.stats[osd_num].time+this.config.osd_out_time
|
||||
);
|
||||
}
|
||||
return kv;
|
||||
}
|
||||
|
||||
_die(err)
|
||||
@@ -891,6 +766,33 @@ class Mon
|
||||
this.on_stop().catch(console.error);
|
||||
this.on_die();
|
||||
}
|
||||
|
||||
local_ips(all)
|
||||
{
|
||||
const ips = [];
|
||||
const ifaces = os.networkInterfaces();
|
||||
for (const ifname in ifaces)
|
||||
{
|
||||
for (const iface of ifaces[ifname])
|
||||
{
|
||||
if (iface.family == 'IPv4' && !iface.internal || all)
|
||||
{
|
||||
ips.push(iface.address);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ips;
|
||||
}
|
||||
}
|
||||
|
||||
function b64(str)
|
||||
{
|
||||
return Buffer.from(str).toString('base64');
|
||||
}
|
||||
|
||||
function de64(str)
|
||||
{
|
||||
return Buffer.from(str, 'base64').toString();
|
||||
}
|
||||
|
||||
function sha1hex(str)
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vitastor-mon",
|
||||
"version": "1.7.1",
|
||||
"version": "1.6.1",
|
||||
"description": "Vitastor SDS monitor service",
|
||||
"main": "mon-main.js",
|
||||
"scripts": {
|
||||
@@ -9,7 +9,6 @@
|
||||
"author": "Vitaliy Filippov",
|
||||
"license": "UNLICENSED",
|
||||
"dependencies": {
|
||||
"antietcd": "^1.1.0",
|
||||
"sprintf-js": "^1.1.2",
|
||||
"ws": "^7.2.5"
|
||||
},
|
||||
|
@@ -57,7 +57,7 @@ function pick_primary(pool_config, osd_set, up_osds, aff_osds)
|
||||
|
||||
function recheck_primary(state, global_config, up_osds, osd_tree)
|
||||
{
|
||||
let new_pg_config;
|
||||
let new_config_pgs;
|
||||
for (const pool_id in state.config.pools)
|
||||
{
|
||||
const pool_cfg = state.config.pools[pool_id];
|
||||
@@ -69,30 +69,30 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
|
||||
reset_rng();
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
if (!state.pg.config.items[pool_id])
|
||||
if (!state.config.pgs.items[pool_id])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const pg_cfg = state.pg.config.items[pool_id][pg_num];
|
||||
const pg_cfg = state.config.pgs.items[pool_id][pg_num];
|
||||
if (pg_cfg)
|
||||
{
|
||||
const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
|
||||
if (pg_cfg.primary != new_primary)
|
||||
{
|
||||
if (!new_pg_config)
|
||||
if (!new_config_pgs)
|
||||
{
|
||||
new_pg_config = JSON.parse(JSON.stringify(state.pg.config));
|
||||
new_config_pgs = JSON.parse(JSON.stringify(state.config.pgs));
|
||||
}
|
||||
console.log(
|
||||
`Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
|
||||
` primary OSD from ${pg_cfg.primary} to ${new_primary}`
|
||||
);
|
||||
new_pg_config.items[pool_id][pg_num].primary = new_primary;
|
||||
new_config_pgs.items[pool_id][pg_num].primary = new_primary;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return new_pg_config;
|
||||
return new_config_pgs;
|
||||
}
|
||||
|
||||
function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
|
||||
@@ -174,7 +174,7 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||
state.osd.stats,
|
||||
pool_cfg.block_size || global_config.block_size || 131072,
|
||||
pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
|
||||
pool_cfg.immediate_commit || global_config.immediate_commit || 'all'
|
||||
pool_cfg.immediate_commit || global_config.immediate_commit || 'none'
|
||||
);
|
||||
pool_tree = make_hier_tree(global_config, pool_tree);
|
||||
// First try last_clean_pgs to minimize data movement
|
||||
@@ -185,10 +185,10 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||
}
|
||||
if (!prev_pgs.length)
|
||||
{
|
||||
// Fall back to pg/config if it's empty
|
||||
for (const pg in ((state.pg.config.items||{})[pool_id]||{}))
|
||||
// Fall back to config/pgs if it's empty
|
||||
for (const pg in ((state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = [ ...state.pg.config.items[pool_id][pg].osd_set ];
|
||||
prev_pgs[pg-1] = [ ...state.config.pgs.items[pool_id][pg].osd_set ];
|
||||
}
|
||||
}
|
||||
const old_pg_count = prev_pgs.length;
|
||||
@@ -205,8 +205,8 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
};
|
||||
let optimize_result;
|
||||
// Re-shuffle PGs if pg/config.hash is empty
|
||||
if (old_pg_count > 0 && state.pg.config.hash)
|
||||
// Re-shuffle PGs if config/pgs.hash is empty
|
||||
if (old_pg_count > 0 && state.config.pgs.hash)
|
||||
{
|
||||
if (prev_pgs.length != pool_cfg.pg_count)
|
||||
{
|
||||
|
@@ -1,220 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const metric_help =
|
||||
`# HELP vitastor_object_bytes Total size of objects in cluster in bytes
|
||||
# TYPE vitastor_object_bytes gauge
|
||||
# HELP vitastor_object_count Total number of objects in cluster
|
||||
# TYPE vitastor_object_count gauge
|
||||
# HELP vitastor_stat_count Total operation count
|
||||
# TYPE vitastor_stat_count counter
|
||||
# HELP vitastor_stat_usec Total operation latency in usec
|
||||
# TYPE vitastor_stat_usec counter
|
||||
# HELP vitastor_stat_bytes Total operation size in bytes
|
||||
# HELP vitastor_stat_bytes counter
|
||||
|
||||
# HELP vitastor_image_raw_used Image raw used size in bytes
|
||||
# TYPE vitastor_image_raw_used counter
|
||||
# HELP vitastor_image_stat_count Per-image total operation count
|
||||
# TYPE vitastor_image_stat_count counter
|
||||
# HELP vitastor_image_stat_usec Per-image total operation latency
|
||||
# TYPE vitastor_image_stat_usec counter
|
||||
# HELP vitastor_image_stat_bytes Per-image total operation size in bytes
|
||||
# TYPE vitastor_image_stat_bytes counter
|
||||
|
||||
# HELP vitastor_osd_status OSD up/down status
|
||||
# TYPE vitastor_osd_status gauge
|
||||
# HELP vitastor_osd_size_bytes OSD total space in bytes
|
||||
# TYPE vitastor_osd_size_bytes gauge
|
||||
# HELP vitastor_osd_free_bytes OSD free space in bytes
|
||||
# TYPE vitastor_osd_free_bytes gauge
|
||||
# HELP vitastor_osd_stat_count Per-image total operation count
|
||||
# TYPE vitastor_osd_stat_count counter
|
||||
# HELP vitastor_osd_stat_usec Per-image total operation latency
|
||||
# TYPE vitastor_osd_stat_usec counter
|
||||
# HELP vitastor_osd_stat_bytes Per-image total operation size in bytes
|
||||
# TYPE vitastor_osd_stat_bytes counter
|
||||
|
||||
# HELP vitastor_monitor_info Monitor info, 1 is master, 0 is standby
|
||||
# TYPE vitastor_monitor_info gauge
|
||||
|
||||
# HELP vitastor_pool_info Pool configuration (in labels)
|
||||
# TYPE vitastor_pool_info gauge
|
||||
# HELP vitastor_pool_status Pool up/down status
|
||||
# TYPE vitastor_pool_status gauge
|
||||
# HELP vitastor_pool_raw_to_usable Raw to usable space ratio
|
||||
# TYPE vitastor_pool_raw_to_usable gauge
|
||||
# HELP vitastor_pool_space_efficiency Pool space usage efficiency
|
||||
# TYPE vitastor_pool_space_efficiency gauge
|
||||
# HELP vitastor_pool_total_raw_tb Total raw space in pool in TB
|
||||
# TYPE vitastor_pool_total_raw_tb gauge
|
||||
# HELP vitastor_pool_used_raw_tb Used raw space in pool in TB
|
||||
# TYPE vitastor_pool_used_raw_tb gauge
|
||||
# HELP vitastor_pg_count PG counts by state
|
||||
# HELP vitastor_pg_count gauge
|
||||
|
||||
`;
|
||||
|
||||
function export_prometheus_metrics(st)
|
||||
{
|
||||
let res = metric_help;
|
||||
|
||||
// Global statistics
|
||||
|
||||
for (const k in st.stats.object_bytes)
|
||||
{
|
||||
res += `vitastor_object_bytes{object_type="${k}"} ${st.stats.object_bytes[k]}\n`;
|
||||
}
|
||||
|
||||
for (const k in st.stats.object_counts)
|
||||
{
|
||||
res += `vitastor_object_count{object_type="${k}"} ${st.stats.object_counts[k]}\n`;
|
||||
}
|
||||
|
||||
for (const typ of [ 'op', 'subop', 'recovery' ])
|
||||
{
|
||||
for (const op in st.stats[typ+"_stats"]||{})
|
||||
{
|
||||
const op_stat = st.stats[typ+"_stats"][op];
|
||||
for (const key of [ 'count', 'usec', 'bytes' ])
|
||||
{
|
||||
res += `vitastor_stat_${key}{op="${op}",op_type="${typ}"} ${op_stat[key]||0}\n`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-image statistics
|
||||
|
||||
for (const pool in st.inode.stats)
|
||||
{
|
||||
for (const inode in st.inode.stats[pool])
|
||||
{
|
||||
const ist = st.inode.stats[pool][inode];
|
||||
const inode_name = ((st.config.inode[pool]||{})[inode]||{}).name||'';
|
||||
const inode_label = `image_name="${addslashes(inode_name)}",inode_num="${inode}",pool_id="${pool}"`;
|
||||
res += `vitastor_image_raw_used{${inode_label}} ${ist.raw_used||0}\n`;
|
||||
for (const op of [ 'read', 'write', 'delete' ])
|
||||
{
|
||||
for (const k of [ 'count', 'usec', 'bytes' ])
|
||||
{
|
||||
if (ist[op])
|
||||
{
|
||||
res += `vitastor_image_stat_${k}{${inode_label},op="${op}"} ${ist[op][k]||0}\n`;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-OSD statistics
|
||||
|
||||
for (const osd in st.osd.stats)
|
||||
{
|
||||
const osd_stat = st.osd.stats[osd];
|
||||
const up = st.osd.state[osd] && st.osd.state[osd].state == 'up' ? 1 : 0;
|
||||
res += `vitastor_osd_status{host="${addslashes(osd_stat.host)}",osd_num="${osd}"} ${up}\n`;
|
||||
res += `vitastor_osd_size_bytes{osd_num="${osd}"} ${osd_stat.size||0}\n`;
|
||||
res += `vitastor_osd_free_bytes{osd_num="${osd}"} ${osd_stat.free||0}\n`;
|
||||
for (const op in osd_stat.op_stats)
|
||||
{
|
||||
const ist = osd_stat.op_stats[op];
|
||||
for (const k of [ 'count', 'usec', 'bytes' ])
|
||||
{
|
||||
res += `vitastor_osd_stat_${k}{osd_num="${osd}",op="${op}",op_type="op"} ${ist[k]||0}\n`;
|
||||
}
|
||||
}
|
||||
for (const op in osd_stat.subop_stats)
|
||||
{
|
||||
const ist = osd_stat.subop_stats[op];
|
||||
for (const k of [ 'count', 'usec', 'bytes' ])
|
||||
{
|
||||
res += `vitastor_osd_stat_${k}{osd_num="${osd}",op="${op}",op_type="subop"} ${ist[k]||0}\n`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Monitor statistics
|
||||
|
||||
for (const mon_id in st.mon.member)
|
||||
{
|
||||
const mon = st.mon.member[mon_id];
|
||||
const master = st.mon.master && st.mon.master.id == mon_id ? 1 : 0;
|
||||
const ip = (mon.ip instanceof Array ? mon.ip[0] : mon.ip) || '';
|
||||
res += `vitastor_monitor_info{monitor_hostname="${addslashes(mon.hostname)}",monitor_id="${mon_id}",monitor_ip="${addslashes(ip)}"} ${master}\n`;
|
||||
}
|
||||
|
||||
// Per-pool statistics
|
||||
|
||||
for (const pool_id in st.config.pools)
|
||||
{
|
||||
const pool_cfg = st.config.pools[pool_id];
|
||||
const pool_label = `pool_id="${pool_id}",pool_name="${addslashes(pool_cfg.name)}"`;
|
||||
const pool_stat = st.pool.stats[pool_id];
|
||||
res += `vitastor_pool_info{${pool_label}`+
|
||||
`,pool_scheme="${addslashes(pool_cfg.scheme)}"`+
|
||||
`,pg_size="${pool_cfg.pg_size||0}",pg_minsize="${pool_cfg.pg_minsize||0}"`+
|
||||
`,parity_chunks="${pool_cfg.parity_chunks||0}",pg_count="${pool_cfg.pg_count||0}"`+
|
||||
`,failure_domain="${addslashes(pool_cfg.failure_domain)}"`+
|
||||
`} 1\n`;
|
||||
if (!pool_stat)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
res += `vitastor_pool_raw_to_usable{${pool_label}} ${pool_stat.raw_to_usable||0}\n`;
|
||||
res += `vitastor_pool_space_efficiency{${pool_label}} ${pool_stat.space_efficiency||0}\n`;
|
||||
res += `vitastor_pool_total_raw_tb{${pool_label}} ${pool_stat.total_raw_tb||0}\n`;
|
||||
res += `vitastor_pool_used_raw_tb{${pool_label}} ${pool_stat.used_raw_tb||0}\n`;
|
||||
|
||||
// PG states and pool up/down status
|
||||
const real_pg_count = (Object.keys(((st.pg.config||{}).items||{})[pool_id]||{}).length) || (0|pool_cfg.pg_count);
|
||||
const per_state = {
|
||||
active: 0,
|
||||
starting: 0,
|
||||
peering: 0,
|
||||
incomplete: 0,
|
||||
repeering: 0,
|
||||
stopping: 0,
|
||||
offline: 0,
|
||||
degraded: 0,
|
||||
has_inconsistent: 0,
|
||||
has_corrupted: 0,
|
||||
has_incomplete: 0,
|
||||
has_degraded: 0,
|
||||
has_misplaced: 0,
|
||||
has_unclean: 0,
|
||||
has_invalid: 0,
|
||||
left_on_dead: 0,
|
||||
scrubbing: 0,
|
||||
};
|
||||
const pool_pg_states = st.pg.state[pool_id] || {};
|
||||
for (let i = 1; i <= real_pg_count; i++)
|
||||
{
|
||||
if (!pool_pg_states[i])
|
||||
{
|
||||
per_state['offline'] = 1 + (per_state['offline']|0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (const st_name of pool_pg_states[i].state)
|
||||
{
|
||||
per_state[st_name] = 1 + (per_state[st_name]|0);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const st_name in per_state)
|
||||
{
|
||||
res += `vitastor_pg_count{pg_state="${st_name}",${pool_label}} ${per_state[st_name]}\n`;
|
||||
}
|
||||
const pool_active = per_state['active'] >= real_pg_count ? 1 : 0;
|
||||
res += `vitastor_pool_status{${pool_label}} ${pool_active}\n`;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
function addslashes(str)
|
||||
{
|
||||
return ((str||'')+'').replace(/(["\n\\])/g, "\\$1"); // escape " \n \
|
||||
}
|
||||
|
||||
module.exports = { export_prometheus_metrics };
|
File diff suppressed because it is too large
Load Diff
38
mon/stats.js
38
mon/stats.js
@@ -3,10 +3,10 @@
|
||||
|
||||
function derive_osd_stats(st, prev, prev_diff)
|
||||
{
|
||||
const diff = prev_diff || { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
|
||||
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
|
||||
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
|
||||
{
|
||||
return diff;
|
||||
return prev_diff || diff;
|
||||
}
|
||||
const timediff = BigInt(st.time*1000 - prev.time*1000);
|
||||
for (const op in st.op_stats||{})
|
||||
@@ -17,7 +17,8 @@ function derive_osd_stats(st, prev, prev_diff)
|
||||
const b = c.bytes - BigInt(pr && pr.bytes||0);
|
||||
const us = c.usec - BigInt(pr && pr.usec||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
diff.op_stats[op] = { ...c, bps: n > 0 ? b*1000n/timediff : 0n, iops: n > 0 ? n*1000n/timediff : 0n, lat: n > 0 ? us/n : 0n };
|
||||
if (n > 0)
|
||||
diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
|
||||
}
|
||||
for (const op in st.subop_stats||{})
|
||||
{
|
||||
@@ -26,7 +27,8 @@ function derive_osd_stats(st, prev, prev_diff)
|
||||
c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
|
||||
const us = c.usec - BigInt(pr && pr.usec||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
diff.subop_stats[op] = { ...c, iops: n > 0 ? n*1000n/timediff : 0n, lat: n > 0 ? us/n : 0n };
|
||||
if (n > 0)
|
||||
diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
|
||||
}
|
||||
for (const op in st.recovery_stats||{})
|
||||
{
|
||||
@@ -35,7 +37,8 @@ function derive_osd_stats(st, prev, prev_diff)
|
||||
c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
|
||||
const b = c.bytes - BigInt(pr && pr.bytes||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
diff.recovery_stats[op] = { ...c, bps: n > 0 ? b*1000n/timediff : 0n, iops: n > 0 ? n*1000n/timediff : 0n };
|
||||
if (n > 0)
|
||||
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
|
||||
}
|
||||
for (const pool_id in st.inode_stats||{})
|
||||
{
|
||||
@@ -50,9 +53,9 @@ function derive_osd_stats(st, prev, prev_diff)
|
||||
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
|
||||
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
|
||||
inode_diff[op] = {
|
||||
bps: n > 0 ? (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff : 0n,
|
||||
iops: n > 0 ? n*1000n/timediff : 0n,
|
||||
lat: n > 0 ? (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/n : 0n,
|
||||
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
|
||||
iops: n*1000n/timediff,
|
||||
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -71,7 +74,7 @@ function sum_op_stats(all_osd, prev_stats)
|
||||
);
|
||||
prev_stats.osd_stats[osd] = cur;
|
||||
}
|
||||
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: { degraded: {}, misplaced: {} } };
|
||||
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
||||
// Sum derived values instead of deriving summed
|
||||
for (const osd in all_osd.state)
|
||||
{
|
||||
@@ -100,19 +103,10 @@ function sum_object_counts(state, global_config)
|
||||
{
|
||||
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
|
||||
const object_bytes = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
|
||||
let pgstats = state.pgstats;
|
||||
if (state.pg.stats)
|
||||
{
|
||||
// Merge with old stats for seamless transition to new stats
|
||||
for (const pool_id in state.pg.stats)
|
||||
{
|
||||
pgstats[pool_id] = { ...(state.pg.stats[pool_id] || {}), ...(pgstats[pool_id] || {}) };
|
||||
}
|
||||
}
|
||||
for (const pool_id in pgstats)
|
||||
for (const pool_id in state.pg.stats)
|
||||
{
|
||||
let object_size = 0;
|
||||
for (const osd_num of pgstats[pool_id].write_osd_set||[])
|
||||
for (const osd_num of state.pg.stats[pool_id].write_osd_set||[])
|
||||
{
|
||||
if (osd_num && state.osd.stats[osd_num] && state.osd.stats[osd_num].block_size)
|
||||
{
|
||||
@@ -130,9 +124,9 @@ function sum_object_counts(state, global_config)
|
||||
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
|
||||
}
|
||||
object_size = BigInt(object_size);
|
||||
for (const pg_num in pgstats[pool_id])
|
||||
for (const pg_num in state.pg.stats[pool_id])
|
||||
{
|
||||
const st = pgstats[pool_id][pg_num];
|
||||
const st = state.pg.stats[pool_id][pg_num];
|
||||
if (st)
|
||||
{
|
||||
for (const k in object_counts)
|
||||
|
37
mon/utils.js
37
mon/utils.js
@@ -1,37 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const os = require('os');
|
||||
|
||||
function local_ips(all)
|
||||
{
|
||||
const ips = [];
|
||||
const ifaces = os.networkInterfaces();
|
||||
for (const ifname in ifaces)
|
||||
{
|
||||
for (const iface of ifaces[ifname])
|
||||
{
|
||||
if (iface.family == 'IPv4' && !iface.internal || all)
|
||||
{
|
||||
ips.push(iface.address);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ips;
|
||||
}
|
||||
|
||||
function b64(str)
|
||||
{
|
||||
return Buffer.from(str).toString('base64');
|
||||
}
|
||||
|
||||
function de64(str)
|
||||
{
|
||||
return Buffer.from(str, 'base64').toString();
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
b64,
|
||||
de64,
|
||||
local_ips,
|
||||
};
|
@@ -1,49 +0,0 @@
|
||||
// AntiEtcd persistence filter for Vitastor
|
||||
// (c) Vitaliy Filippov, 2024
|
||||
// License: Mozilla Public License 2.0 or Vitastor Network Public License 1.1
|
||||
|
||||
function vitastor_persist_filter(cfg)
|
||||
{
|
||||
const prefix = cfg.vitastor_prefix || '/vitastor';
|
||||
return (key, value) =>
|
||||
{
|
||||
if (key.substr(0, prefix.length+'/osd/stats/'.length) == prefix+'/osd/stats/')
|
||||
{
|
||||
if (value)
|
||||
{
|
||||
try
|
||||
{
|
||||
value = JSON.parse(value);
|
||||
value = JSON.stringify({
|
||||
bitmap_granularity: value.bitmap_granularity || undefined,
|
||||
data_block_size: value.data_block_size || undefined,
|
||||
host: value.host || undefined,
|
||||
immediate_commit: value.immediate_commit || undefined,
|
||||
});
|
||||
}
|
||||
catch (e)
|
||||
{
|
||||
console.error('invalid JSON in '+key+' = '+value+': '+e);
|
||||
value = '{}';
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
value = undefined;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
else if (key.substr(0, prefix.length+'/osd/'.length) == prefix+'/osd/' ||
|
||||
key.substr(0, prefix.length+'/inode/stats/'.length) == prefix+'/inode/stats/' ||
|
||||
key.substr(0, prefix.length+'/pg/stats/'.length) == prefix+'/pg/stats/' || // old name
|
||||
key.substr(0, prefix.length+'/pgstats/'.length) == prefix+'/pgstats/' ||
|
||||
key.substr(0, prefix.length+'/pool/stats/'.length) == prefix+'/pool/stats/' ||
|
||||
key == prefix+'/stats')
|
||||
{
|
||||
return undefined;
|
||||
}
|
||||
return value;
|
||||
};
|
||||
}
|
||||
|
||||
module.exports = vitastor_persist_filter;
|
@@ -1,80 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "addon.h"
|
||||
|
||||
// Initialize the node addon
|
||||
NAN_MODULE_INIT(InitAddon)
|
||||
{
|
||||
// vitastor.Client
|
||||
|
||||
v8::Local<v8::FunctionTemplate> tpl = Nan::New<v8::FunctionTemplate>(NodeVitastor::Create);
|
||||
tpl->SetClassName(Nan::New("Client").ToLocalChecked());
|
||||
tpl->InstanceTemplate()->SetInternalFieldCount(1);
|
||||
|
||||
Nan::SetPrototypeMethod(tpl, "read", NodeVitastor::Read);
|
||||
Nan::SetPrototypeMethod(tpl, "write", NodeVitastor::Write);
|
||||
Nan::SetPrototypeMethod(tpl, "sync", NodeVitastor::Sync);
|
||||
Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastor::ReadBitmap);
|
||||
//Nan::SetPrototypeMethod(tpl, "destroy", NodeVitastor::Destroy);
|
||||
|
||||
Nan::Set(target, Nan::New("Client").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
|
||||
|
||||
// vitastor.Image (opened image)
|
||||
|
||||
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorImage::Create);
|
||||
tpl->SetClassName(Nan::New("Image").ToLocalChecked());
|
||||
tpl->InstanceTemplate()->SetInternalFieldCount(1);
|
||||
|
||||
Nan::SetPrototypeMethod(tpl, "read", NodeVitastorImage::Read);
|
||||
Nan::SetPrototypeMethod(tpl, "write", NodeVitastorImage::Write);
|
||||
Nan::SetPrototypeMethod(tpl, "sync", NodeVitastorImage::Sync);
|
||||
Nan::SetPrototypeMethod(tpl, "get_info", NodeVitastorImage::GetInfo);
|
||||
Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastorImage::ReadBitmap);
|
||||
|
||||
Nan::Set(target, Nan::New("Image").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
|
||||
|
||||
// vitastor.KV
|
||||
|
||||
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKV::Create);
|
||||
tpl->SetClassName(Nan::New("KV").ToLocalChecked());
|
||||
tpl->InstanceTemplate()->SetInternalFieldCount(1);
|
||||
|
||||
Nan::SetPrototypeMethod(tpl, "open", NodeVitastorKV::Open);
|
||||
Nan::SetPrototypeMethod(tpl, "set_config", NodeVitastorKV::SetConfig);
|
||||
Nan::SetPrototypeMethod(tpl, "close", NodeVitastorKV::Close);
|
||||
Nan::SetPrototypeMethod(tpl, "get_size", NodeVitastorKV::GetSize);
|
||||
Nan::SetPrototypeMethod(tpl, "get", NodeVitastorKV::Get);
|
||||
Nan::SetPrototypeMethod(tpl, "get_cached", NodeVitastorKV::GetCached);
|
||||
Nan::SetPrototypeMethod(tpl, "set", NodeVitastorKV::Set);
|
||||
Nan::SetPrototypeMethod(tpl, "del", NodeVitastorKV::Del);
|
||||
Nan::SetPrototypeMethod(tpl, "list", NodeVitastorKV::List);
|
||||
|
||||
Nan::Set(target, Nan::New("KV").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
|
||||
|
||||
Nan::Set(target, Nan::New("ENOENT").ToLocalChecked(), Nan::New<v8::Int32>(-ENOENT));
|
||||
Nan::Set(target, Nan::New("EIO").ToLocalChecked(), Nan::New<v8::Int32>(-EIO));
|
||||
Nan::Set(target, Nan::New("EINVAL").ToLocalChecked(), Nan::New<v8::Int32>(-EINVAL));
|
||||
Nan::Set(target, Nan::New("EROFS").ToLocalChecked(), Nan::New<v8::Int32>(-EROFS));
|
||||
Nan::Set(target, Nan::New("ENOSPC").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSPC));
|
||||
Nan::Set(target, Nan::New("EINTR").ToLocalChecked(), Nan::New<v8::Int32>(-EINTR));
|
||||
Nan::Set(target, Nan::New("EILSEQ").ToLocalChecked(), Nan::New<v8::Int32>(-EILSEQ));
|
||||
Nan::Set(target, Nan::New("ENOTBLK").ToLocalChecked(), Nan::New<v8::Int32>(-ENOTBLK));
|
||||
Nan::Set(target, Nan::New("ENOSYS").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSYS));
|
||||
Nan::Set(target, Nan::New("EAGAIN").ToLocalChecked(), Nan::New<v8::Int32>(-EAGAIN));
|
||||
|
||||
// Listing handle
|
||||
|
||||
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKVListing::Create);
|
||||
tpl->SetClassName(Nan::New("KVListing").ToLocalChecked());
|
||||
tpl->InstanceTemplate()->SetInternalFieldCount(1);
|
||||
|
||||
Nan::SetPrototypeMethod(tpl, "next", NodeVitastorKVListing::Next);
|
||||
Nan::SetPrototypeMethod(tpl, "close", NodeVitastorKVListing::Close);
|
||||
|
||||
Nan::Set(target, Nan::New("KVListing").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
|
||||
|
||||
NodeVitastorKV::listing_class.Reset(Nan::GetFunction(tpl).ToLocalChecked());
|
||||
}
|
||||
|
||||
NODE_MODULE(addon, (void*)InitAddon)
|
@@ -1,20 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#ifndef NODE_VITASTOR_ADDON_H
|
||||
#define NODE_VITASTOR_ADDON_H
|
||||
|
||||
#include <nan.h>
|
||||
#include <vitastor_c.h>
|
||||
|
||||
#include "client.h"
|
||||
|
||||
#define ERRORF(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
|
||||
|
||||
#define TRACEF(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
|
||||
#define TRACE(msg) fprintf(stderr, "%s\n", msg);
|
||||
|
||||
//#define TRACEF(format, arg) ;
|
||||
//#define TRACE(msg) ;
|
||||
|
||||
#endif
|
@@ -1,21 +0,0 @@
|
||||
{
|
||||
'targets': [
|
||||
{
|
||||
'target_name': 'addon',
|
||||
'sources': [
|
||||
'client.cc',
|
||||
'addon.cc'
|
||||
],
|
||||
'include_dirs': [
|
||||
'<!(node -e "require(\'nan\')")'
|
||||
],
|
||||
'cflags': [
|
||||
'<!(pkg-config --cflags vitastor) -g'
|
||||
],
|
||||
'libraries': [
|
||||
'<!(pkg-config --libs vitastor)',
|
||||
'-lvitastor_kv'
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -1,968 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "addon.h"
|
||||
|
||||
#define NODE_VITASTOR_READ 1
|
||||
#define NODE_VITASTOR_WRITE 2
|
||||
#define NODE_VITASTOR_SYNC 3
|
||||
#define NODE_VITASTOR_READ_BITMAP 4
|
||||
#define NODE_VITASTOR_GET_INFO 5
|
||||
|
||||
#ifndef INODE_POOL
|
||||
#define INODE_POOL(inode) (uint32_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (uint64_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((uint64_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
#endif
|
||||
|
||||
class NodeVitastorRequest: public Nan::AsyncResource
|
||||
{
|
||||
public:
|
||||
NodeVitastorRequest(NodeVitastor *cli, v8::Local<v8::Function> cb): Nan::AsyncResource("NodeVitastorRequest")
|
||||
{
|
||||
this->cli = cli;
|
||||
callback.Reset(cb);
|
||||
}
|
||||
|
||||
iovec iov;
|
||||
std::vector<iovec> iov_list;
|
||||
NodeVitastor *cli = NULL;
|
||||
NodeVitastorImage *img = NULL;
|
||||
int op = 0;
|
||||
uint64_t offset = 0, len = 0, version = 0;
|
||||
bool with_parents = false;
|
||||
Nan::Persistent<v8::Function> callback;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// NodeVitastor
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// Builds an empty wrapper; the actual client is attached later in Create().
NodeVitastor::NodeVitastor(): Nan::ObjectWrap()
{
    TRACE("NodeVitastor: constructor");
    // Lets on_io_readable() find the wrapper from the libuv handle
    poll_watcher.data = static_cast<void*>(this);
}
|
||||
|
||||
// Stops event polling and destroys the underlying client.
// NOTE(review): the poll handle is only stopped, never uv_close()d, before this
// object goes away - confirm that is acceptable for libuv.
NodeVitastor::~NodeVitastor()
{
    TRACE("NodeVitastor: destructor");
    uv_poll_stop(&poll_watcher);
    vitastor_c_destroy(c);
    c = nullptr;
}
|
||||
|
||||
// constructor({ ...config })
// Creates the client from a flat JS object of options (every value is converted
// to a string) and registers its io_uring eventfd in the default libuv loop.
// Throws a JS Error if the eventfd cannot be created or registered.
NAN_METHOD(NodeVitastor::Create)
{
    TRACE("NodeVitastor::Create");
    v8::Local<v8::Object> jsParams = info[0].As<v8::Object>();
    v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
    // Flattened [key, value, key, value, ...] list
    std::vector<std::string> cfg;
    for (uint32_t i = 0; i < keys->Length(); i++)
    {
        auto key = Nan::Get(keys, i).ToLocalChecked();
        cfg.push_back(std::string(*Nan::Utf8String(key)));
        cfg.push_back(std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked())));
    }

    // c_cfg only borrows the strings owned by cfg
    std::vector<const char*> c_cfg;
    c_cfg.reserve(cfg.size());
    for (const auto & item: cfg)
    {
        c_cfg.push_back(item.c_str());
    }
    NodeVitastor* cli = new NodeVitastor();
    cli->c = vitastor_c_create_uring_json(c_cfg.data(), cfg.size());

    int res = vitastor_c_uring_register_eventfd(cli->c);
    if (res >= 0)
    {
        cli->eventfd = res;
        res = uv_poll_init(uv_default_loop(), &cli->poll_watcher, cli->eventfd);
        if (res >= 0)
            res = uv_poll_start(&cli->poll_watcher, UV_READABLE, on_io_readable);
        // Process events that may already be pending, but only if polling really started
        if (res >= 0)
            on_io_readable(&cli->poll_watcher, 0, UV_READABLE);
    }
    if (res < 0)
    {
        // Report the code of the step that actually failed (not the eventfd number)
        ERRORF("NodeVitastor: failed to create and register io_uring eventfd in libuv: %s", strerror(-res));
        vitastor_c_destroy(cli->c);
        cli->c = NULL;
        // NOTE(review): cli itself is not freed here; its destructor would touch
        // a poll handle that may never have been initialised - confirm the intended cleanup
        Nan::ThrowError("failed to create and register io_uring eventfd");
        return;
    }

    cli->Wrap(info.This());
    info.GetReturnValue().Set(info.This());
}
|
||||
|
||||
// libuv poll callback: lets the client process completed events under the
// client mutex, then runs the completion callbacks collected meanwhile.
void NodeVitastor::on_io_readable(uv_poll_t* handle, int status, int revents)
{
    TRACEF("NodeVitastor::on_io_readable status/revents %d %d", status, revents);
    if (!(revents & UV_READABLE))
    {
        return;
    }
    NodeVitastor* cli = static_cast<NodeVitastor*>(handle->data);
    {
        std::lock_guard<std::mutex> guard(cli->mu);
        vitastor_c_uring_handle_events(cli->c);
    }
    // JS callbacks are invoked only after the mutex is released
    cli->run_postponed();
}
|
||||
|
||||
void NodeVitastor::run_postponed()
|
||||
{
|
||||
std::vector<std::function<void()>> callbacks;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mu);
|
||||
callbacks.swap(postponed);
|
||||
}
|
||||
for (auto & cb: callbacks)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
}
|
||||
|
||||
// Parses (offset, len, callback) starting at info[argpos] and allocates the
// destination buffer. Throws a JS error and returns NULL if allocation fails.
NodeVitastorRequest* NodeVitastor::get_read_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
{
    const uint64_t read_offset = Nan::To<int64_t>(info[argpos+0]).FromJust();
    const uint64_t read_len = Nan::To<int64_t>(info[argpos+1]).FromJust();
    uint8_t *data = (uint8_t*)malloc(read_len);
    if (data == NULL)
    {
        Nan::ThrowError("failed to allocate memory");
        return NULL;
    }

    NodeVitastorRequest *req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
    req->offset = read_offset;
    req->len = read_len;
    req->iov.iov_base = data;
    req->iov.iov_len = read_len;
    return req;
}
|
||||
|
||||
// read(pool, inode, offset, len, callback(err, buffer, version))
|
||||
NAN_METHOD(NodeVitastor::Read)
|
||||
{
|
||||
TRACE("NodeVitastor::Read");
|
||||
|
||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||
|
||||
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
|
||||
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
|
||||
|
||||
auto req = self->get_read_request(info, 2);
|
||||
|
||||
self->Ref();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(self->mu);
|
||||
vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, postpone_read_finish, req);
|
||||
}
|
||||
self->run_postponed();
|
||||
}
|
||||
|
||||
// Parses (offset, buf: Buffer | Buffer[], { version }?, callback) starting at
// info[argpos] and returns a new request describing the write.
// The buffers are referenced in place, not copied.
// NOTE(review): nothing visible here keeps the JS Buffers alive until the write
// completes - confirm they cannot be collected in the meantime.
NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
{
    uint64_t offset = Nan::To<int64_t>(info[argpos+0]).FromJust();
    const auto & bufarg = info[argpos+1];
    uint64_t version = 0;
    // A function is an object too, so the callback must be excluded explicitly,
    // otherwise write(..., buf, callback) would take the callback for the options
    if (!info[argpos+2].IsEmpty() &&
        !info[argpos+2]->IsFunction() &&
        info[argpos+2]->IsObject())
    {
        auto key = Nan::New<v8::String>("version").ToLocalChecked();
        auto params = info[argpos+2].As<v8::Object>();
        auto versionObj = Nan::Get(params, key).ToLocalChecked();
        if (!versionObj.IsEmpty())
            version = Nan::To<int64_t>(versionObj).FromJust();
        // Options consumed: the callback is one position further
        argpos++;
    }

    v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
    auto req = new NodeVitastorRequest(this, callback);

    req->offset = offset;
    req->version = version;

    if (bufarg->IsArray())
    {
        // Array of buffers: build a scatter list, total length is the sum
        auto buffers = bufarg.As<v8::Array>();
        req->len = 0;
        for (uint32_t i = 0; i < buffers->Length(); i++)
        {
            auto buffer_obj = Nan::Get(buffers, i).ToLocalChecked();
            char *buf = node::Buffer::Data(buffer_obj);
            uint64_t len = node::Buffer::Length(buffer_obj);
            req->iov_list.push_back({ .iov_base = buf, .iov_len = len });
            req->len += len;
        }
    }
    else
    {
        char *buf = node::Buffer::Data(bufarg);
        uint64_t len = node::Buffer::Length(bufarg);
        req->iov = { .iov_base = buf, .iov_len = len };
        req->len = len;
    }

    return req;
}
|
||||
|
||||
// write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
|
||||
NAN_METHOD(NodeVitastor::Write)
|
||||
{
|
||||
TRACE("NodeVitastor::Write");
|
||||
|
||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||
|
||||
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
|
||||
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
|
||||
|
||||
auto req = self->get_write_request(info, 2);
|
||||
|
||||
self->Ref();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(self->mu);
|
||||
vitastor_c_write(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
|
||||
req->iov_list.size() ? req->iov_list.data() : &req->iov,
|
||||
req->iov_list.size() ? req->iov_list.size() : 1,
|
||||
postpone_write_finish, req);
|
||||
}
|
||||
self->run_postponed();
|
||||
}
|
||||
|
||||
// sync(callback(err))
|
||||
NAN_METHOD(NodeVitastor::Sync)
|
||||
{
|
||||
TRACE("NodeVitastor::Sync");
|
||||
|
||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||
|
||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
||||
auto req = new NodeVitastorRequest(self, callback);
|
||||
|
||||
self->Ref();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(self->mu);
|
||||
vitastor_c_sync(self->c, postpone_write_finish, req);
|
||||
}
|
||||
self->run_postponed();
|
||||
}
|
||||
|
||||
// read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
|
||||
NAN_METHOD(NodeVitastor::ReadBitmap)
|
||||
{
|
||||
TRACE("NodeVitastor::ReadBitmap");
|
||||
|
||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||
|
||||
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
|
||||
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
|
||||
uint64_t offset = Nan::To<int64_t>(info[2]).FromJust();
|
||||
uint64_t len = Nan::To<int64_t>(info[3]).FromJust();
|
||||
bool with_parents = Nan::To<bool>(info[4]).FromJust();
|
||||
v8::Local<v8::Function> callback = info[5].As<v8::Function>();
|
||||
auto req = new NodeVitastorRequest(self, callback);
|
||||
|
||||
self->Ref();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(self->mu);
|
||||
vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, postpone_read_bitmap_finish, req);
|
||||
}
|
||||
self->run_postponed();
|
||||
}
|
||||
|
||||
// Calls the JS callback with a single argument: null when retval is 0,
// otherwise retval as a 32-bit integer.
// Legal errors: EINVAL, EIO, EROFS, ENOSPC, EINTR, ENOENT
static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
{
    v8::Local<v8::Value> argv[1];
    argv[0] = retval
        ? v8::Local<v8::Value>(Nan::New<v8::Int32>((int32_t)retval))
        : v8::Local<v8::Value>(Nan::Null());
    nanCallback.Call(1, argv, req);
}
|
||||
|
||||
void NodeVitastor::on_read_finish(void *opaque, long retval, uint64_t version)
|
||||
{
|
||||
TRACE("NodeVitastor::on_read_finish");
|
||||
Nan::HandleScope scope;
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
if (retval == -ENOENT)
|
||||
{
|
||||
free(req->iov.iov_base);
|
||||
nanCallback.Call(0, NULL, req);
|
||||
}
|
||||
else if (retval < 0)
|
||||
{
|
||||
free(req->iov.iov_base);
|
||||
on_error(req, nanCallback, retval);
|
||||
}
|
||||
else
|
||||
{
|
||||
v8::Local<v8::Value> args[3];
|
||||
args[0] = Nan::Null();
|
||||
args[1] = Nan::NewBuffer((char*)req->iov.iov_base, req->iov.iov_len).ToLocalChecked();
|
||||
args[2] = v8::BigInt::NewFromUnsigned(v8::Isolate::GetCurrent(), version);
|
||||
nanCallback.Call(3, args, req);
|
||||
}
|
||||
req->cli->Unref();
|
||||
delete req;
|
||||
}
|
||||
|
||||
void NodeVitastor::on_write_finish(void *opaque, long retval)
|
||||
{
|
||||
TRACE("NodeVitastor::on_write_finish");
|
||||
Nan::HandleScope scope;
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
on_error(req, nanCallback, retval);
|
||||
req->cli->Unref();
|
||||
delete req;
|
||||
}
|
||||
|
||||
void NodeVitastor::on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap)
|
||||
{
|
||||
TRACE("NodeVitastor::on_read_bitmap_finish");
|
||||
Nan::HandleScope scope;
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
if (retval == -ENOENT)
|
||||
nanCallback.Call(0, NULL, req);
|
||||
else if (retval < 0)
|
||||
on_error(req, nanCallback, retval);
|
||||
else
|
||||
{
|
||||
v8::Local<v8::Value> args[2];
|
||||
args[0] = Nan::Null();
|
||||
args[1] = Nan::NewBuffer((char*)bitmap, (retval+7)/8).ToLocalChecked();
|
||||
nanCallback.Call(2, args, req);
|
||||
}
|
||||
req->cli->Unref();
|
||||
delete req;
|
||||
}
|
||||
|
||||
void NodeVitastor::postpone_read_finish(void *opaque, long retval, uint64_t version)
|
||||
{
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
req->cli->postponed.push_back([=]() { on_read_finish(opaque, retval, version); });
|
||||
}
|
||||
|
||||
void NodeVitastor::postpone_write_finish(void *opaque, long retval)
|
||||
{
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
req->cli->postponed.push_back([=]() { on_write_finish(opaque, retval); });
|
||||
}
|
||||
|
||||
void NodeVitastor::postpone_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap)
|
||||
{
|
||||
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
|
||||
req->cli->postponed.push_back([=]() { on_read_bitmap_finish(opaque, retval, bitmap); });
|
||||
}
|
||||
|
||||
//NAN_METHOD(NodeVitastor::Destroy)
|
||||
//{
|
||||
// TRACE("NodeVitastor::Destroy");
|
||||
//}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// NodeVitastorImage
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// constructor(node_vitastor, name)
// Wraps a named image of the given client and starts watching its inode;
// on_watch_start() is called once the watch is available.
NAN_METHOD(NodeVitastorImage::Create)
{
    TRACE("NodeVitastorImage::Create");

    v8::Local<v8::Object> parent = info[0].As<v8::Object>();
    std::string image_name(*Nan::Utf8String(info[1].As<v8::String>()));
    NodeVitastor *client = Nan::ObjectWrap::Unwrap<NodeVitastor>(parent);

    NodeVitastorImage *img = new NodeVitastorImage();
    img->Wrap(info.This());
    img->cli = client;
    img->name = image_name;

    // The image is unreferenced again in on_watch_start(),
    // the client in the image destructor
    img->Ref();
    client->Ref();
    std::unique_lock<std::mutex> lock(client->mu);
    vitastor_c_watch_inode(client->c, (char*)img->name.c_str(), on_watch_start, img);

    info.GetReturnValue().Set(info.This());
}
|
||||
|
||||
// Closes the inode watch, if one was opened, and releases the client
// reference taken in Create().
NodeVitastorImage::~NodeVitastorImage()
{
    if (watch != NULL)
    {
        vitastor_c_close_watch(cli->c, watch);
        watch = nullptr;
    }
    cli->Unref();
}
|
||||
|
||||
// read(offset, len, callback(err, buffer, version))
|
||||
NAN_METHOD(NodeVitastorImage::Read)
|
||||
{
|
||||
TRACE("NodeVitastorImage::Read");
|
||||
|
||||
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
||||
|
||||
auto req = img->cli->get_read_request(info, 0);
|
||||
req->img = img;
|
||||
req->op = NODE_VITASTOR_READ;
|
||||
|
||||
img->exec_or_wait(req);
|
||||
}
|
||||
|
||||
// write(offset, buffer, { version }?, callback(err))
|
||||
NAN_METHOD(NodeVitastorImage::Write)
|
||||
{
|
||||
TRACE("NodeVitastorImage::Write");
|
||||
|
||||
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
||||
|
||||
auto req = img->cli->get_write_request(info, 0);
|
||||
req->img = img;
|
||||
req->op = NODE_VITASTOR_WRITE;
|
||||
|
||||
img->exec_or_wait(req);
|
||||
}
|
||||
|
||||
// sync(callback(err))
// Syncs the image; the request is queued until the image is initialised.
NAN_METHOD(NodeVitastorImage::Sync)
{
    TRACE("NodeVitastorImage::Sync");

    NodeVitastorImage* image = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
    NodeVitastorRequest *req = new NodeVitastorRequest(image->cli, info[0].As<v8::Function>());
    req->op = NODE_VITASTOR_SYNC;
    req->img = image;
    image->exec_or_wait(req);
}
|
||||
|
||||
// read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
|
||||
NAN_METHOD(NodeVitastorImage::ReadBitmap)
|
||||
{
|
||||
TRACE("NodeVitastorImage::ReadBitmap");
|
||||
|
||||
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
||||
|
||||
uint64_t offset = Nan::To<int64_t>(info[0]).FromJust();
|
||||
uint64_t len = Nan::To<int64_t>(info[1]).FromJust();
|
||||
bool with_parents = Nan::To<bool>(info[2]).FromJust();
|
||||
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
|
||||
|
||||
auto req = new NodeVitastorRequest(img->cli, callback);
|
||||
req->img = img;
|
||||
req->op = NODE_VITASTOR_READ_BITMAP;
|
||||
req->offset = offset;
|
||||
req->len = len;
|
||||
req->with_parents = with_parents;
|
||||
|
||||
img->exec_or_wait(req);
|
||||
}
|
||||
|
||||
// get_info(callback(info))
// Reports image parameters; the request is queued until the image is initialised.
NAN_METHOD(NodeVitastorImage::GetInfo)
{
    TRACE("NodeVitastorImage::GetInfo");

    NodeVitastorImage* image = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
    NodeVitastorRequest *req = new NodeVitastorRequest(image->cli, info[0].As<v8::Function>());
    req->op = NODE_VITASTOR_GET_INFO;
    req->img = image;
    image->exec_or_wait(req);
}
|
||||
|
||||
// Executes the request right away when the inode watch is already set;
// otherwise parks it in on_init until on_watch_start() runs.
void NodeVitastorImage::exec_or_wait(NodeVitastorRequest *req)
{
    if (watch)
    {
        exec_request(req);
        return;
    }
    // Need to wait for initialisation
    on_init.push_back(req);
}
|
||||
|
||||
// Submits an image request to the client according to req->op (GET_INFO is
// answered synchronously), then runs the completion callbacks postponed so far.
// Every branch reads <watch>, so it has to be set before this is called.
void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
{
    if (req->op == NODE_VITASTOR_READ)
    {
        // The client stays referenced until the completion callback unrefs it
        cli->Ref();
        std::unique_lock<std::mutex> lock(cli->mu);
        uint64_t ino = vitastor_c_inode_get_num(watch);
        vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::postpone_read_finish, req);
    }
    else if (req->op == NODE_VITASTOR_WRITE)
    {
        cli->Ref();
        std::unique_lock<std::mutex> lock(cli->mu);
        uint64_t ino = vitastor_c_inode_get_num(watch);
        vitastor_c_write(cli->c, ino, req->offset, req->len, req->version,
            req->iov_list.size() ? req->iov_list.data() : &req->iov,
            req->iov_list.size() ? req->iov_list.size() : 1,
            NodeVitastor::postpone_write_finish, req);
    }
    else if (req->op == NODE_VITASTOR_SYNC)
    {
        cli->Ref();
        std::unique_lock<std::mutex> lock(cli->mu);
        uint64_t ino = vitastor_c_inode_get_num(watch);
        uint32_t imm = vitastor_c_inode_get_immediate_commit(cli->c, ino);
        if (imm != IMMEDIATE_ALL)
        {
            vitastor_c_sync(cli->c, NodeVitastor::postpone_write_finish, req);
        }
        else
        {
            // No sync is issued in this mode: report success through the usual path
            NodeVitastor::postpone_write_finish(req, 0);
        }
    }
    else if (req->op == NODE_VITASTOR_READ_BITMAP)
    {
        cli->Ref();
        std::unique_lock<std::mutex> lock(cli->mu);
        uint64_t ino = vitastor_c_inode_get_num(watch);
        vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::postpone_read_bitmap_finish, req);
    }
    else if (req->op == NODE_VITASTOR_GET_INFO)
    {
        // Open the scope before creating any handle: queued requests reach this
        // point from on_watch_start(), i.e. not from inside a JS call
        Nan::HandleScope scope;
        v8::Local<v8::Object> res = Nan::New<v8::Object>();

        fill_info(res);

        Nan::Callback nanCallback(Nan::New(req->callback));
        v8::Local<v8::Value> args[1];
        args[0] = res;
        nanCallback.Call(1, args, req);

        delete req;
    }
    cli->run_postponed();
}
|
||||
|
||||
// Fills <res> with the image parameters: pool_id, inode_num, size,
// parent_pool_id/parent_inode_num (only when there is a parent), readonly,
// meta (only when present), mod_revision, block_size, bitmap_granularity and
// immediate_commit. size and mod_revision become BigInt when they are >= 2^53.
void NodeVitastorImage::fill_info(v8::Local<v8::Object> & res)
{
    std::unique_lock<std::mutex> lock(cli->mu);

    const uint64_t size = vitastor_c_inode_get_size(watch);
    const uint64_t num = vitastor_c_inode_get_num(watch);
    const uint32_t block_size = vitastor_c_inode_get_block_size(cli->c, num);
    const uint32_t bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(cli->c, num);
    const int readonly = vitastor_c_inode_get_readonly(watch);
    const uint32_t immediate_commit = vitastor_c_inode_get_immediate_commit(cli->c, num);
    const uint64_t parent_id = vitastor_c_inode_get_parent_id(watch);
    char *meta = vitastor_c_inode_get_meta(watch);
    const uint64_t mod_revision = vitastor_c_inode_get_mod_revision(watch);

    // Property name as a JS string
    auto prop = [](const char *name)
    {
        return Nan::New<v8::String>(name).ToLocalChecked();
    };
    // Plain number while it is exactly representable, BigInt otherwise
    auto num_or_bigint = [](uint64_t value)
    {
        return value < ((uint64_t)1<<53)
            ? v8::Local<v8::Value>(Nan::New<v8::Number>(value))
            : v8::Local<v8::Value>(v8::BigInt::NewFromUnsigned(v8::Isolate::GetCurrent(), value));
    };

    Nan::Set(res, prop("pool_id"), Nan::New<v8::Number>(INODE_POOL(num)));
    Nan::Set(res, prop("inode_num"), Nan::New<v8::Number>(INODE_NO_POOL(num)));
    Nan::Set(res, prop("size"), num_or_bigint(size));
    if (parent_id)
    {
        Nan::Set(res, prop("parent_pool_id"), Nan::New<v8::Number>(INODE_POOL(parent_id)));
        Nan::Set(res, prop("parent_inode_num"), Nan::New<v8::Number>(INODE_NO_POOL(parent_id)));
    }
    Nan::Set(res, prop("readonly"), Nan::New((bool)readonly));
    if (meta)
    {
        // meta is parsed as JSON text
        Nan::JSON nanJSON;
        Nan::Set(res, prop("meta"), nanJSON.Parse(Nan::New<v8::String>(meta).ToLocalChecked()).ToLocalChecked());
    }
    Nan::Set(res, prop("mod_revision"), num_or_bigint(mod_revision));
    Nan::Set(res, prop("block_size"), Nan::New(block_size));
    Nan::Set(res, prop("bitmap_granularity"), Nan::New(bitmap_granularity));
    Nan::Set(res, prop("immediate_commit"), Nan::New(immediate_commit));
}
|
||||
|
||||
void NodeVitastorImage::on_watch_start(void *opaque, long retval)
|
||||
{
|
||||
NodeVitastorImage *img = (NodeVitastorImage *)opaque;
|
||||
{
|
||||
img->watch = (void*)retval;
|
||||
auto on_init = std::move(img->on_init);
|
||||
for (auto req: on_init)
|
||||
{
|
||||
img->exec_request(req);
|
||||
}
|
||||
}
|
||||
img->Unref();
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// NodeVitastorKV
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
Nan::Persistent<v8::Function> NodeVitastorKV::listing_class;
|
||||
|
||||
// constructor(node_vitastor)
|
||||
NAN_METHOD(NodeVitastorKV::Create)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Create");
|
||||
|
||||
v8::Local<v8::Object> parent = info[0].As<v8::Object>();
|
||||
NodeVitastor *cli = Nan::ObjectWrap::Unwrap<NodeVitastor>(parent);
|
||||
|
||||
NodeVitastorKV *kv = new NodeVitastorKV();
|
||||
kv->cli = cli;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(cli->mu);
|
||||
kv->dbw = new vitastorkv_dbw_t((cluster_client_t*)vitastor_c_get_internal_client(cli->c));
|
||||
}
|
||||
|
||||
kv->Wrap(info.This());
|
||||
cli->Ref();
|
||||
info.GetReturnValue().Set(info.This());
|
||||
}
|
||||
|
||||
// Destroys the database wrapper and releases the client reference taken in Create().
NodeVitastorKV::~NodeVitastorKV()
{
    delete dbw;
    cli->Unref();
}
|
||||
|
||||
// open(pool_id, inode_num, { ...config }, callback(err))
|
||||
NAN_METHOD(NodeVitastorKV::Open)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Open");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
uint64_t inode_id = INODE_WITH_POOL(Nan::To<int64_t>(info[0]).FromJust(), Nan::To<int64_t>(info[1]).FromJust());
|
||||
|
||||
v8::Local<v8::Object> jsParams = info[2].As<v8::Object>();
|
||||
v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
|
||||
std::map<std::string, std::string> cfg;
|
||||
for (uint32_t i = 0; i < keys->Length(); i++)
|
||||
{
|
||||
auto key = Nan::Get(keys, i).ToLocalChecked();
|
||||
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
|
||||
}
|
||||
|
||||
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
|
||||
auto req = new NodeVitastorRequest(kv->cli, callback);
|
||||
|
||||
kv->Ref();
|
||||
kv->dbw->open(inode_id, cfg, [kv, req](int res)
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
v8::Local<v8::Value> args[1];
|
||||
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
|
||||
nanCallback.Call(1, args, req);
|
||||
delete req;
|
||||
kv->Unref();
|
||||
});
|
||||
}
|
||||
|
||||
// close(callback(err))
|
||||
NAN_METHOD(NodeVitastorKV::Close)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Close");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
||||
auto req = new NodeVitastorRequest(kv->cli, callback);
|
||||
|
||||
kv->Ref();
|
||||
kv->dbw->close([kv, req]()
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
nanCallback.Call(0, NULL, req);
|
||||
delete req;
|
||||
kv->Unref();
|
||||
});
|
||||
}
|
||||
|
||||
// set_config({ ...config })
|
||||
NAN_METHOD(NodeVitastorKV::SetConfig)
|
||||
{
|
||||
TRACE("NodeVitastorKV::SetConfig");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
v8::Local<v8::Object> jsParams = info[0].As<v8::Object>();
|
||||
v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
|
||||
std::map<std::string, std::string> cfg;
|
||||
for (uint32_t i = 0; i < keys->Length(); i++)
|
||||
{
|
||||
auto key = Nan::Get(keys, i).ToLocalChecked();
|
||||
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
|
||||
}
|
||||
|
||||
kv->dbw->set_config(cfg);
|
||||
}
|
||||
|
||||
// get_size()
|
||||
NAN_METHOD(NodeVitastorKV::GetSize)
|
||||
{
|
||||
TRACE("NodeVitastorKV::GetSize");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
auto size = kv->dbw->get_size();
|
||||
info.GetReturnValue().Set((size < ((uint64_t)1<<53))
|
||||
? v8::Local<v8::Value>(Nan::New<v8::Number>(size))
|
||||
: v8::Local<v8::Value>(v8::BigInt::NewFromUnsigned(info.GetIsolate(), size)));
|
||||
}
|
||||
|
||||
// Shared implementation of get() and get_cached(): looks up a string key and
// calls callback(err, value) - (null, value) on success, (code, null) on failure.
void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info, bool allow_cache)
{
    NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());

    // FIXME: Handle Buffer too
    std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
    NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, info[1].As<v8::Function>());

    // Keep the wrapper alive until the completion handler has run
    kv->Ref();
    auto on_done = [kv, req](int res, const std::string & value)
    {
        Nan::HandleScope scope;
        Nan::Callback nanCallback(Nan::New(req->callback));
        v8::Local<v8::Value> argv[2];
        if (res)
        {
            argv[0] = Nan::New<v8::Int32>(res);
            argv[1] = Nan::Null();
        }
        else
        {
            argv[0] = Nan::Null();
            argv[1] = Nan::New<v8::String>(value).ToLocalChecked();
        }
        nanCallback.Call(2, argv, req);
        delete req;
        kv->Unref();
    };
    kv->dbw->get(key, on_done, allow_cache);
}
|
||||
|
||||
// get(key, callback(err, value))
|
||||
NAN_METHOD(NodeVitastorKV::Get)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Get");
|
||||
get_impl(info, false);
|
||||
}
|
||||
|
||||
// get_cached(key, callback(err, value))
|
||||
NAN_METHOD(NodeVitastorKV::GetCached)
|
||||
{
|
||||
TRACE("NodeVitastorKV::GetCached");
|
||||
get_impl(info, true);
|
||||
}
|
||||
|
||||
static std::function<bool(int, const std::string &)> make_cas_callback(NodeVitastorRequest *cas_req)
|
||||
{
|
||||
return [cas_req](int res, const std::string & value)
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(cas_req->callback));
|
||||
v8::Local<v8::Value> args[1];
|
||||
args[0] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(value).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
|
||||
Nan::MaybeLocal<v8::Value> ret = nanCallback.Call(1, args, cas_req);
|
||||
if (ret.IsEmpty())
|
||||
return false;
|
||||
return Nan::To<bool>(ret.ToLocalChecked()).FromJust();
|
||||
};
|
||||
}
|
||||
|
||||
// set(key, value, callback(err), cas_compare(old_value))
|
||||
NAN_METHOD(NodeVitastorKV::Set)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Set");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
// FIXME: Handle Buffer too
|
||||
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
||||
std::string value(*Nan::Utf8String(info[1].As<v8::String>()));
|
||||
|
||||
v8::Local<v8::Function> callback = info[2].As<v8::Function>();
|
||||
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
|
||||
|
||||
std::function<bool(int, const std::string &)> cas_cb;
|
||||
if (info.Length() > 3 && info[3]->IsObject())
|
||||
{
|
||||
v8::Local<v8::Function> cas_callback = info[3].As<v8::Function>();
|
||||
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
|
||||
cas_cb = make_cas_callback(cas_req);
|
||||
}
|
||||
|
||||
kv->Ref();
|
||||
kv->dbw->set(key, value, [kv, req, cas_req](int res)
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
v8::Local<v8::Value> args[1];
|
||||
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
|
||||
nanCallback.Call(1, args, req);
|
||||
delete req;
|
||||
if (cas_req)
|
||||
delete cas_req;
|
||||
kv->Unref();
|
||||
}, cas_cb);
|
||||
}
|
||||
|
||||
// del(key, callback(err), cas_compare(old_value))
|
||||
NAN_METHOD(NodeVitastorKV::Del)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Del");
|
||||
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
|
||||
// FIXME: Handle Buffer too
|
||||
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
||||
|
||||
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
|
||||
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
|
||||
|
||||
std::function<bool(int, const std::string &)> cas_cb;
|
||||
if (info.Length() > 2 && info[2]->IsObject())
|
||||
{
|
||||
v8::Local<v8::Function> cas_callback = info[2].As<v8::Function>();
|
||||
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
|
||||
cas_cb = make_cas_callback(cas_req);
|
||||
}
|
||||
|
||||
kv->Ref();
|
||||
kv->dbw->del(key, [kv, req, cas_req](int res)
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
v8::Local<v8::Value> args[1];
|
||||
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
|
||||
nanCallback.Call(1, args, req);
|
||||
delete req;
|
||||
if (cas_req)
|
||||
delete cas_req;
|
||||
kv->Unref();
|
||||
}, cas_cb);
|
||||
}
|
||||
|
||||
// list(start_key?)
|
||||
NAN_METHOD(NodeVitastorKV::List)
|
||||
{
|
||||
TRACE("NodeVitastorKV::List");
|
||||
|
||||
v8::Local<v8::Function> cons = Nan::New(listing_class);
|
||||
v8::Local<v8::Value> args[2];
|
||||
args[0] = info.This();
|
||||
int narg = 1;
|
||||
if (info.Length() > 1 && info[1]->IsString())
|
||||
{
|
||||
args[1] = info[1];
|
||||
narg = 2;
|
||||
}
|
||||
info.GetReturnValue().Set(Nan::NewInstance(cons, narg, args).ToLocalChecked());
|
||||
}
|
||||
|
||||
/*NAN_METHOD(NodeVitastorKV::Destroy)
|
||||
{
|
||||
TRACE("NodeVitastorKV::Destroy");
|
||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||
std::unique_lock<std::mutex> lock(self->mu);
|
||||
if (!kv->dead)
|
||||
kv->Unref();
|
||||
}*/
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// NodeVitastorKVListing
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// constructor(node_vitastor_kv, start_key?)
|
||||
NAN_METHOD(NodeVitastorKVListing::Create)
|
||||
{
|
||||
TRACE("NodeVitastorKVListing::Create");
|
||||
|
||||
v8::Local<v8::Object> parent = info[0].As<v8::Object>();
|
||||
NodeVitastorKV *kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(parent);
|
||||
|
||||
std::string start_key;
|
||||
// FIXME: Handle Buffer too
|
||||
if (info.Length() > 1 && info[1]->IsString())
|
||||
{
|
||||
start_key = std::string(*Nan::Utf8String(info[1].As<v8::String>()));
|
||||
}
|
||||
|
||||
NodeVitastorKVListing *list = new NodeVitastorKVListing();
|
||||
list->kv = kv;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(kv->cli->mu);
|
||||
list->handle = list->kv->dbw->list_start(start_key);
|
||||
}
|
||||
|
||||
list->Wrap(info.This());
|
||||
kv->Ref();
|
||||
info.GetReturnValue().Set(info.This());
|
||||
}
|
||||
|
||||
// Closes the listing if it is still open and releases the database reference
// taken in Create().
NodeVitastorKVListing::~NodeVitastorKVListing()
{
    if (handle != NULL)
    {
        std::lock_guard<std::mutex> guard(kv->cli->mu);
        kv->dbw->list_close(handle);
        handle = nullptr;
    }
    kv->Unref();
}
|
||||
|
||||
// next(callback(err, value))
|
||||
NAN_METHOD(NodeVitastorKVListing::Next)
|
||||
{
|
||||
TRACE("NodeVitastorKVListing::Next");
|
||||
|
||||
NodeVitastorKVListing* list = Nan::ObjectWrap::Unwrap<NodeVitastorKVListing>(info.This());
|
||||
|
||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
||||
auto req = new NodeVitastorRequest(list->kv->cli, callback);
|
||||
if (!list->handle)
|
||||
{
|
||||
// Already closed
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
v8::Local<v8::Value> args[1];
|
||||
args[0] = Nan::New<v8::Int32>(-EINVAL);
|
||||
nanCallback.Call(1, args, req);
|
||||
delete req;
|
||||
return;
|
||||
}
|
||||
|
||||
list->kv->Ref();
|
||||
list->kv->dbw->list_next(list->handle, [list, req](int res, const std::string & key, const std::string & value)
|
||||
{
|
||||
Nan::HandleScope scope;
|
||||
Nan::Callback nanCallback(Nan::New(req->callback));
|
||||
v8::Local<v8::Value> args[3];
|
||||
args[0] = Nan::New<v8::Int32>(res);
|
||||
args[1] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(key).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
|
||||
args[2] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(value).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
|
||||
nanCallback.Call(3, args, req);
|
||||
delete req;
|
||||
list->kv->Unref();
|
||||
});
|
||||
}
|
||||
|
||||
// close()
|
||||
NAN_METHOD(NodeVitastorKVListing::Close)
|
||||
{
|
||||
TRACE("NodeVitastorKVListing::Close");
|
||||
|
||||
NodeVitastorKVListing* list = Nan::ObjectWrap::Unwrap<NodeVitastorKVListing>(info.This());
|
||||
|
||||
if (list->handle)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(list->kv->cli->mu);
|
||||
list->kv->dbw->list_close(list->handle);
|
||||
list->handle = NULL;
|
||||
}
|
||||
}
|
@@ -1,146 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#ifndef NODE_VITASTOR_CLIENT_H
|
||||
#define NODE_VITASTOR_CLIENT_H
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <nan.h>
|
||||
#include <vitastor_c.h>
|
||||
#include <vitastor_kv.h>
|
||||
|
||||
class NodeVitastorRequest;
|
||||
|
||||
class NodeVitastor: public Nan::ObjectWrap
|
||||
{
|
||||
public:
|
||||
// constructor({ ...config })
|
||||
static NAN_METHOD(Create);
|
||||
// read(pool, inode, offset, len, callback(err, buffer, version))
|
||||
static NAN_METHOD(Read);
|
||||
// write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
|
||||
static NAN_METHOD(Write);
|
||||
// sync(callback(err))
|
||||
static NAN_METHOD(Sync);
|
||||
// read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
|
||||
static NAN_METHOD(ReadBitmap);
|
||||
// // destroy()
|
||||
// static NAN_METHOD(Destroy);
|
||||
|
||||
~NodeVitastor();
|
||||
|
||||
private:
|
||||
vitastor_c *c = NULL;
|
||||
int eventfd = -1;
|
||||
uv_poll_t poll_watcher;
|
||||
// FIXME: Is it really needed?
|
||||
std::mutex mu;
|
||||
std::vector<std::function<void()>> postponed;
|
||||
|
||||
NodeVitastor();
|
||||
|
||||
static void on_io_readable(uv_poll_t* handle, int status, int revents);
|
||||
static void on_read_finish(void *opaque, long retval, uint64_t version);
|
||||
static void on_write_finish(void *opaque, long retval);
|
||||
static void on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);
|
||||
|
||||
void run_postponed();
|
||||
static void postpone_read_finish(void *opaque, long retval, uint64_t version);
|
||||
static void postpone_write_finish(void *opaque, long retval);
|
||||
static void postpone_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);
|
||||
|
||||
NodeVitastorRequest* get_read_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
|
||||
NodeVitastorRequest* get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
|
||||
|
||||
friend class NodeVitastorImage;
|
||||
friend class NodeVitastorKV;
|
||||
friend class NodeVitastorKVListing;
|
||||
};
|
||||
|
||||
class NodeVitastorImage: public Nan::ObjectWrap
|
||||
{
|
||||
public:
|
||||
// constructor(node_vitastor, name)
|
||||
static NAN_METHOD(Create);
|
||||
// read(offset, len, callback(err, buffer, version))
|
||||
static NAN_METHOD(Read);
|
||||
// write(offset, buf: Buffer | Buffer[], { version }?, callback(err))
|
||||
static NAN_METHOD(Write);
|
||||
// sync(callback(err))
|
||||
static NAN_METHOD(Sync);
|
||||
// read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
|
||||
static NAN_METHOD(ReadBitmap);
|
||||
// get_info(callback({ num, name, size, parent_id?, readonly?, meta?, mod_revision, block_size, bitmap_granularity, immediate_commit }))
|
||||
static NAN_METHOD(GetInfo);
|
||||
|
||||
~NodeVitastorImage();
|
||||
|
||||
private:
|
||||
NodeVitastor *cli = NULL;
|
||||
std::string name;
|
||||
void *watch = NULL;
|
||||
std::vector<NodeVitastorRequest*> on_init;
|
||||
Nan::Persistent<v8::Object> cliObj;
|
||||
|
||||
static void on_watch_start(void *opaque, long retval);
|
||||
void exec_request(NodeVitastorRequest *req);
|
||||
void exec_or_wait(NodeVitastorRequest *req);
|
||||
void fill_info(v8::Local<v8::Object> & res);
|
||||
};
|
||||
|
||||
class NodeVitastorKV: public Nan::ObjectWrap
|
||||
{
|
||||
public:
|
||||
// constructor(node_vitastor)
|
||||
static NAN_METHOD(Create);
|
||||
// open(pool_id, inode_num, { ...config }, callback(err))
|
||||
static NAN_METHOD(Open);
|
||||
// set_config({ ...config })
|
||||
static NAN_METHOD(SetConfig);
|
||||
// close(callback())
|
||||
static NAN_METHOD(Close);
|
||||
// get_size()
|
||||
static NAN_METHOD(GetSize);
|
||||
// get(key, callback(err, value))
|
||||
static NAN_METHOD(Get);
|
||||
// get_cached(key, callback(err, value))
|
||||
static NAN_METHOD(GetCached);
|
||||
// set(key, value, callback(err), cas_compare(old_value))
|
||||
static NAN_METHOD(Set);
|
||||
// del(key, callback(err), cas_compare(old_value))
|
||||
static NAN_METHOD(Del);
|
||||
// list(start_key?)
|
||||
static NAN_METHOD(List);
|
||||
|
||||
~NodeVitastorKV();
|
||||
|
||||
static Nan::Persistent<v8::Function> listing_class;
|
||||
|
||||
private:
|
||||
NodeVitastor *cli = NULL;
|
||||
vitastorkv_dbw_t *dbw = NULL;
|
||||
|
||||
static void get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info, bool allow_cache);
|
||||
|
||||
friend class NodeVitastorKVListing;
|
||||
};
|
||||
|
||||
class NodeVitastorKVListing: public Nan::ObjectWrap
|
||||
{
|
||||
public:
|
||||
// constructor(node_vitastor_kv, start_key?)
|
||||
static NAN_METHOD(Create);
|
||||
// next(callback(err, value))
|
||||
static NAN_METHOD(Next);
|
||||
// close()
|
||||
static NAN_METHOD(Close);
|
||||
|
||||
~NodeVitastorKVListing();
|
||||
|
||||
private:
|
||||
NodeVitastorKV *kv = NULL;
|
||||
void *handle = NULL;
|
||||
};
|
||||
|
||||
#endif
|
@@ -1 +0,0 @@
|
||||
// Load the compiled native addon through the 'bindings' helper and re-export it as is
const loadBinding = require('bindings');

module.exports = loadBinding('addon.node');
|
@@ -1,24 +0,0 @@
|
||||
{
|
||||
"name": "vitastor",
|
||||
"version": "1.7.0",
|
||||
"description": "Low-level native bindings to Vitastor client library",
|
||||
"main": "index.js",
|
||||
"keywords": [
|
||||
"storage",
|
||||
"sds",
|
||||
"vitastor"
|
||||
],
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git://git.yourcmc.ru/vitalif/vitastor.git"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "node-gyp rebuild"
|
||||
},
|
||||
"author": "Vitaliy Filippov",
|
||||
"license": "VNPL-2.0",
|
||||
"dependencies": {
|
||||
"bindings": "1.5.0",
|
||||
"nan": "^2.19.0"
|
||||
}
|
||||
}
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VITASTOR_VERSION = '1.7.1'
|
||||
VERSION = '1.6.1'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@@ -238,7 +238,7 @@ class VitastorDriver(driver.CloneableImageVD,
|
||||
|
||||
stats = {
|
||||
'vendor_name': 'Vitastor',
|
||||
'driver_version': VITASTOR_VERSION,
|
||||
'driver_version': self.VERSION,
|
||||
'storage_protocol': 'vitastor',
|
||||
'total_capacity_gb': 'unknown',
|
||||
'free_capacity_gb': 'unknown',
|
||||
|
@@ -71,7 +71,7 @@ index c9baeda639..85e1df5a56 100644
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index ca390c5700..d2dbaeb279 100644
|
||||
index ca390c5700..8f11ae9fa5 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3201,7 +3201,7 @@
|
||||
@@ -120,7 +120,7 @@ index ca390c5700..d2dbaeb279 100644
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5148,6 +5171,20 @@
|
||||
@@ -5148,6 +5171,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
@@ -129,9 +129,6 @@ index ca390c5700..d2dbaeb279 100644
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @location: Where to store the new image file. This location cannot
|
||||
+# point to a snapshot.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
@@ -141,7 +138,7 @@ index ca390c5700..d2dbaeb279 100644
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5370,6 +5407,7 @@
|
||||
@@ -5370,6 +5404,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
|
@@ -71,7 +71,7 @@ index 0a99a059ec..16dc440118 100644
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 746d1694c2..199a146a0b 100644
|
||||
index 746d1694c2..fb7aa4423b 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3203,7 +3203,7 @@
|
||||
@@ -120,7 +120,7 @@ index 746d1694c2..199a146a0b 100644
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5180,6 +5203,20 @@
|
||||
@@ -5180,6 +5203,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
@@ -129,9 +129,6 @@ index 746d1694c2..199a146a0b 100644
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @location: Where to store the new image file. This location cannot
|
||||
+# point to a snapshot.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
@@ -141,7 +138,7 @@ index 746d1694c2..199a146a0b 100644
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5402,6 +5439,7 @@
|
||||
@@ -5402,6 +5436,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
|
@@ -18,11 +18,10 @@ fi
|
||||
cd ~/rpmbuild/SPECS
|
||||
rpmbuild -bp fio.spec
|
||||
cd $VITASTOR
|
||||
VER=$(grep ^Version: rpm/vitastor-el7.spec | awk '{print $2}')
|
||||
ln -s ~/rpmbuild/BUILD/fio*/ fio
|
||||
sh copy-fio-includes.sh
|
||||
rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-1.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.6.1$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -36,8 +36,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el7.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.7.1
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.7.1.el7.tar.gz
|
||||
Source0: vitastor-1.6.1.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -144,8 +144,6 @@ mkdir -p /etc/vitastor
|
||||
groupadd -r -f vitastor 2>/dev/null ||:
|
||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
||||
mkdir -p /etc/vitastor
|
||||
mkdir -p /var/lib/vitastor
|
||||
chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
|
||||
%files -n vitastor-client
|
||||
@@ -163,7 +161,6 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client-devel
|
||||
%_includedir/vitastor_c.h
|
||||
%_includedir/vitastor_kv.h
|
||||
%_libdir/pkgconfig
|
||||
|
||||
|
||||
|
@@ -35,8 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el8.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.7.1
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.7.1.el8.tar.gz
|
||||
Source0: vitastor-1.6.1.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -141,8 +141,6 @@ mkdir -p /etc/vitastor
|
||||
groupadd -r -f vitastor 2>/dev/null ||:
|
||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
||||
mkdir -p /etc/vitastor
|
||||
mkdir -p /var/lib/vitastor
|
||||
chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
|
||||
%files -n vitastor-client
|
||||
@@ -160,7 +158,6 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client-devel
|
||||
%_includedir/vitastor_c.h
|
||||
%_includedir/vitastor_kv.h
|
||||
%_libdir/pkgconfig
|
||||
|
||||
|
||||
|
@@ -18,8 +18,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el9.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.7.1
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.7.1.el9.tar.gz
|
||||
Source0: vitastor-1.6.1.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -134,8 +134,6 @@ mkdir -p /etc/vitastor
|
||||
groupadd -r -f vitastor 2>/dev/null ||:
|
||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
||||
mkdir -p /etc/vitastor
|
||||
mkdir -p /var/lib/vitastor
|
||||
chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
|
||||
%files -n vitastor-client
|
||||
@@ -153,7 +151,6 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client-devel
|
||||
%_includedir/vitastor_c.h
|
||||
%_includedir/vitastor_kv.h
|
||||
%_libdir/pkgconfig
|
||||
|
||||
|
||||
|
@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVITASTOR_VERSION="1.7.1")
|
||||
add_definitions(-DVERSION="1.6.1")
|
||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_link_options(-fno-omit-frame-pointer)
|
||||
if (${WITH_ASAN})
|
||||
|
@@ -13,7 +13,7 @@ target_link_libraries(vitastor_blk
|
||||
# for timerfd_manager
|
||||
vitastor_common
|
||||
)
|
||||
set_target_properties(vitastor_blk PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
|
||||
set_target_properties(vitastor_blk PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
|
||||
if (${WITH_FIO})
|
||||
# libfio_vitastor_blk.so
|
||||
|
@@ -12,7 +12,6 @@ add_library(vitastor_common STATIC
|
||||
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
|
||||
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ${MSGR_RDMA}
|
||||
)
|
||||
target_link_libraries(vitastor_common pthread)
|
||||
target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||
|
||||
# libvitastor_client.so
|
||||
@@ -29,7 +28,7 @@ target_link_libraries(vitastor_client
|
||||
${LIBURING_LIBRARIES}
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
set_target_properties(vitastor_client PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
|
||||
set_target_properties(vitastor_client PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
configure_file(vitastor.pc.in vitastor.pc @ONLY)
|
||||
|
||||
if (${WITH_FIO})
|
||||
|
@@ -452,10 +452,11 @@ void cluster_client_t::on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_nu
|
||||
if (pg_cfg.cur_primary != prev_primary)
|
||||
{
|
||||
// Repeat this PG operations because an OSD which stopped being primary may not fsync operations
|
||||
wb->repeat_ops_for(this, 0, pool_id, pg_num);
|
||||
if (wb->repeat_ops_for(this, 0, pool_id, pg_num) > 0)
|
||||
{
|
||||
continue_ops();
|
||||
}
|
||||
}
|
||||
// Always continue to resume operations hung because of lack of the primary OSD
|
||||
continue_ops();
|
||||
}
|
||||
|
||||
bool cluster_client_t::get_immediate_commit(uint64_t inode)
|
||||
@@ -1065,11 +1066,11 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
!pg_it->second.pause && pg_it->second.cur_primary)
|
||||
{
|
||||
osd_num_t primary_osd = pg_it->second.cur_primary;
|
||||
part->osd_num = primary_osd;
|
||||
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
|
||||
if (peer_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
int peer_fd = peer_it->second;
|
||||
part->osd_num = primary_osd;
|
||||
part->flags |= PART_SENT;
|
||||
op->inflight_count++;
|
||||
uint64_t pg_bitmap_size = (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8) * (
|
||||
|
@@ -333,10 +333,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
etcd_watch_ws = NULL;
|
||||
}
|
||||
if (this->log_level > 1)
|
||||
{
|
||||
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju/%ju/%ju\n", etcd_address.c_str(),
|
||||
etcd_watch_revision_config, etcd_watch_revision_osd, etcd_watch_revision_pg);
|
||||
}
|
||||
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju\n", etcd_address.c_str(), etcd_watch_revision);
|
||||
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
|
||||
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
|
||||
{
|
||||
@@ -351,20 +348,16 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
}
|
||||
else
|
||||
{
|
||||
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
|
||||
if (data["result"]["created"].bool_value())
|
||||
{
|
||||
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
|
||||
if (watch_id == ETCD_CONFIG_WATCH_ID ||
|
||||
watch_id == ETCD_PG_STATE_WATCH_ID ||
|
||||
watch_id == ETCD_PG_HISTORY_WATCH_ID ||
|
||||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
{
|
||||
etcd_watches_initialised++;
|
||||
}
|
||||
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && this->log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju/%ju/%ju\n", cur_addr.c_str(),
|
||||
etcd_watch_revision_config, etcd_watch_revision_osd, etcd_watch_revision_pg);
|
||||
}
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju\n", cur_addr.c_str(), etcd_watch_revision);
|
||||
}
|
||||
if (data["result"]["canceled"].bool_value())
|
||||
{
|
||||
@@ -382,7 +375,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
http_close(etcd_watch_ws);
|
||||
etcd_watch_ws = NULL;
|
||||
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = 0;
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
}
|
||||
return;
|
||||
@@ -400,29 +393,13 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// Save revision only if it's present in the message - because sometimes etcd sends something without a header, like:
|
||||
// {"error": {"grpc_code": 14, "http_code": 503, "http_status": "Service Unavailable", "message": "error reading from server: EOF"}}
|
||||
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && !data["result"]["header"]["revision"].is_null())
|
||||
{
|
||||
// Restart watchers from the same revision number as in the last received message,
|
||||
// not from the next one to protect against revision being split into multiple messages,
|
||||
// even though etcd guarantees not to do that **within a single watcher** without fragment=true:
|
||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#watch-apis
|
||||
// Revision contents are ALWAYS split into separate messages for different watchers though!
|
||||
// So generally we have to resume each watcher from its own revision...
|
||||
// Progress messages may have watch_id=-1 if sent on behalf of multiple watchers though.
|
||||
// And antietcd has an advanced semantic which merges the same revision for all watchers
|
||||
// into one message and just omits watch_id.
|
||||
// So we also have to handle the case where watch_id is -1 or not present (0).
|
||||
auto watch_rev = data["result"]["header"]["revision"].uint64_value();
|
||||
if (!watch_id || watch_id == UINT64_MAX)
|
||||
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = watch_rev;
|
||||
else if (watch_id == ETCD_CONFIG_WATCH_ID)
|
||||
etcd_watch_revision_config = watch_rev;
|
||||
else if (watch_id == ETCD_PG_STATE_WATCH_ID)
|
||||
etcd_watch_revision_pg = watch_rev;
|
||||
else if (watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
etcd_watch_revision_osd = watch_rev;
|
||||
// Protect against a revision being split into multiple messages and some
|
||||
// of them being lost. Even though I'm not sure if etcd actually splits them
|
||||
// Also sometimes etcd sends something without a header, like:
|
||||
// {"error": {"grpc_code": 14, "http_code": 503, "http_status": "Service Unavailable", "message": "error reading from server: EOF"}}
|
||||
etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
|
||||
addresses_to_try.clear();
|
||||
}
|
||||
// First gather all changes into a hash to remove multiple overwrites
|
||||
@@ -480,7 +457,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
{ "create_request", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/config/") },
|
||||
{ "range_end", base64_encode(etcd_prefix+"/config0") },
|
||||
{ "start_revision", etcd_watch_revision_config },
|
||||
{ "start_revision", etcd_watch_revision },
|
||||
{ "watch_id", ETCD_CONFIG_WATCH_ID },
|
||||
{ "progress_notify", true },
|
||||
} }
|
||||
@@ -489,21 +466,29 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
{ "create_request", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
|
||||
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
|
||||
{ "start_revision", etcd_watch_revision_osd },
|
||||
{ "start_revision", etcd_watch_revision },
|
||||
{ "watch_id", ETCD_OSD_STATE_WATCH_ID },
|
||||
{ "progress_notify", true },
|
||||
} }
|
||||
}).dump());
|
||||
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
|
||||
{ "create_request", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/pg/") },
|
||||
{ "range_end", base64_encode(etcd_prefix+"/pg0") },
|
||||
{ "start_revision", etcd_watch_revision_pg },
|
||||
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
|
||||
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
|
||||
{ "start_revision", etcd_watch_revision },
|
||||
{ "watch_id", ETCD_PG_STATE_WATCH_ID },
|
||||
{ "progress_notify", true },
|
||||
} }
|
||||
}).dump());
|
||||
// FIXME: Do not watch /pg/history/ at all in client code (not in OSD)
|
||||
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
|
||||
{ "create_request", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
|
||||
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
|
||||
{ "start_revision", etcd_watch_revision },
|
||||
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID },
|
||||
{ "progress_notify", true },
|
||||
} }
|
||||
}).dump());
|
||||
if (on_start_watcher_hook)
|
||||
{
|
||||
on_start_watcher_hook(etcd_watch_ws);
|
||||
@@ -588,7 +573,7 @@ void etcd_state_client_t::load_global_config()
|
||||
{
|
||||
global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||
}
|
||||
global_immediate_commit = parse_immediate_commit(global_config["immediate_commit"].string_value(), IMMEDIATE_ALL);
|
||||
global_immediate_commit = parse_immediate_commit(global_config["immediate_commit"].string_value());
|
||||
on_load_config_hook(global_config);
|
||||
});
|
||||
}
|
||||
@@ -606,11 +591,6 @@ void etcd_state_client_t::load_pgs()
|
||||
{ "key", base64_encode(etcd_prefix+"/config/pgs") },
|
||||
} }
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/pg/config") },
|
||||
} }
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(etcd_prefix+"/config/inode/") },
|
||||
@@ -660,10 +640,13 @@ void etcd_state_client_t::load_pgs()
|
||||
return;
|
||||
}
|
||||
reset_pg_exists();
|
||||
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = data["header"]["revision"].uint64_value()+1;
|
||||
if (this->log_level > 3)
|
||||
if (!etcd_watch_revision)
|
||||
{
|
||||
fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision_pg-1);
|
||||
etcd_watch_revision = data["header"]["revision"].uint64_value()+1;
|
||||
if (this->log_level > 3)
|
||||
{
|
||||
fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision-1);
|
||||
}
|
||||
}
|
||||
for (auto & res: data["responses"].array_items())
|
||||
{
|
||||
@@ -730,7 +713,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
|
||||
{
|
||||
if (!pg_cfg.state_exists)
|
||||
{
|
||||
if (this->log_level > 3 && (pg_cfg.cur_primary || pg_cfg.cur_state))
|
||||
if (this->log_level > 3)
|
||||
{
|
||||
fprintf(stderr, "PG %u/%u primary OSD disappeared after reload, forgetting it\n", pool_item.first, pg_it->first);
|
||||
}
|
||||
@@ -740,7 +723,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
|
||||
}
|
||||
if (!pg_cfg.history_exists)
|
||||
{
|
||||
if (this->log_level > 3 && (pg_cfg.target_history.size() || pg_cfg.all_peers.size() || pg_cfg.epoch || pg_cfg.next_scrub))
|
||||
if (this->log_level > 3)
|
||||
{
|
||||
fprintf(stderr, "PG %u/%u history disappeared after reload, forgetting it\n", pool_item.first, pg_it->first);
|
||||
}
|
||||
@@ -884,7 +867,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
|
||||
// Immediate Commit Mode
|
||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
||||
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value())
|
||||
: global_immediate_commit;
|
||||
// PG Stripe Size
|
||||
pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
|
||||
@@ -912,17 +895,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
on_change_pool_config_hook();
|
||||
}
|
||||
}
|
||||
else if (key == etcd_prefix+"/pg/config" || key == etcd_prefix+"/config/pgs")
|
||||
else if (key == etcd_prefix+"/config/pgs")
|
||||
{
|
||||
if (key == etcd_prefix+"/pg/config")
|
||||
{
|
||||
new_pg_config = !value.is_null();
|
||||
}
|
||||
else if (new_pg_config)
|
||||
{
|
||||
// Ignore old key if the new one is present
|
||||
return;
|
||||
}
|
||||
for (auto & pool_item: this->pool_config)
|
||||
{
|
||||
for (auto & pg_item: pool_item.second.pg_config)
|
||||
@@ -1201,11 +1175,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value)
|
||||
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str)
|
||||
{
|
||||
return (immediate_commit_str == "all" ? IMMEDIATE_ALL :
|
||||
(immediate_commit_str == "small" ? IMMEDIATE_SMALL :
|
||||
(immediate_commit_str == "none" ? IMMEDIATE_NONE : default_value)));
|
||||
return immediate_commit_str == "all" ? IMMEDIATE_ALL :
|
||||
(immediate_commit_str == "small" ? IMMEDIATE_SMALL : IMMEDIATE_NONE);
|
||||
}
|
||||
|
||||
uint32_t etcd_state_client_t::parse_scheme(const std::string & scheme)
|
||||
|
@@ -10,9 +10,10 @@
|
||||
#include "timerfd_manager.h"
|
||||
|
||||
#define ETCD_CONFIG_WATCH_ID 1
|
||||
#define ETCD_OSD_STATE_WATCH_ID 2
|
||||
#define ETCD_PG_STATE_WATCH_ID 3
|
||||
#define ETCD_TOTAL_WATCHES 3
|
||||
#define ETCD_PG_STATE_WATCH_ID 2
|
||||
#define ETCD_PG_HISTORY_WATCH_ID 3
|
||||
#define ETCD_OSD_STATE_WATCH_ID 4
|
||||
#define ETCD_TOTAL_WATCHES 4
|
||||
|
||||
#define DEFAULT_BLOCK_SIZE 128*1024
|
||||
#define MIN_DATA_BLOCK_SIZE 4*1024
|
||||
@@ -94,7 +95,7 @@ protected:
|
||||
std::string selected_etcd_address;
|
||||
std::vector<std::string> addresses_to_try;
|
||||
std::vector<inode_watch_t*> watches;
|
||||
bool new_pg_config = false;
|
||||
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL;
|
||||
int ws_keepalive_timer = -1;
|
||||
int ws_alive = 0;
|
||||
bool rand_initialized = false;
|
||||
@@ -114,11 +115,8 @@ public:
|
||||
int log_level = 0;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
|
||||
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL;
|
||||
int etcd_watches_initialised = 0;
|
||||
uint64_t etcd_watch_revision_config = 0;
|
||||
uint64_t etcd_watch_revision_osd = 0;
|
||||
uint64_t etcd_watch_revision_pg = 0;
|
||||
uint64_t etcd_watch_revision = 0;
|
||||
std::map<pool_id_t, pool_config_t> pool_config;
|
||||
std::map<osd_num_t, json11::Json> peer_states;
|
||||
std::set<osd_num_t> seen_peers;
|
||||
@@ -159,6 +157,6 @@ public:
|
||||
int address_count();
|
||||
~etcd_state_client_t();
|
||||
|
||||
static uint32_t parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value);
|
||||
static uint32_t parse_immediate_commit(const std::string & immediate_commit_str);
|
||||
static uint32_t parse_scheme(const std::string & scheme_str);
|
||||
};
|
||||
|
@@ -271,7 +271,7 @@ void http_co_t::close_connection()
|
||||
}
|
||||
if (peer_fd >= 0)
|
||||
{
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
close(peer_fd);
|
||||
peer_fd = -1;
|
||||
}
|
||||
@@ -314,7 +314,7 @@ void http_co_t::start_connection()
|
||||
stackout();
|
||||
return;
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
@@ -372,7 +372,7 @@ void http_co_t::handle_connect_result()
|
||||
}
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
|
@@ -16,32 +16,46 @@
|
||||
#endif
|
||||
|
||||
#include <sys/poll.h>
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
msgr_iothread_t::msgr_iothread_t():
|
||||
ring(RINGLOOP_DEFAULT_SIZE, true),
|
||||
thread(&msgr_iothread_t::run, this)
|
||||
static uint64_t one = 1;
|
||||
|
||||
msgr_iothread_t::msgr_iothread_t()
|
||||
{
|
||||
eventfd = ring.register_eventfd();
|
||||
if (eventfd < 0)
|
||||
ring = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||
epmgr = new epoll_manager_t(ring);
|
||||
submit_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (submit_eventfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("failed to register eventfd: ") + strerror(-eventfd));
|
||||
throw std::runtime_error(std::string("failed to create eventfd: ")+strerror(errno));
|
||||
}
|
||||
epmgr->tfd->set_fd_handler(submit_eventfd, EPOLLIN, [this](int fd, int epoll_events)
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(submit_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
ring->wakeup();
|
||||
});
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
read_requests();
|
||||
send_replies();
|
||||
ring->submit();
|
||||
};
|
||||
ring->register_consumer(&consumer);
|
||||
thread = new std::thread(&msgr_iothread_t::run, this);
|
||||
}
|
||||
|
||||
msgr_iothread_t::~msgr_iothread_t()
|
||||
{
|
||||
stop();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::add_sqe(io_uring_sqe & sqe)
|
||||
{
|
||||
mu.lock();
|
||||
queue.push_back((iothread_sqe_t){ .sqe = sqe, .data = std::move(*(ring_data_t*)sqe.user_data) });
|
||||
if (queue.size() == 1)
|
||||
{
|
||||
cond.notify_all();
|
||||
}
|
||||
mu.unlock();
|
||||
delete thread;
|
||||
delete epmgr;
|
||||
delete ring;
|
||||
}
|
||||
|
||||
void msgr_iothread_t::stop()
|
||||
@@ -53,65 +67,152 @@ void msgr_iothread_t::stop()
|
||||
return;
|
||||
}
|
||||
stopped = true;
|
||||
if (outer_loop_data)
|
||||
{
|
||||
outer_loop_data->callback = [](ring_data_t*){};
|
||||
}
|
||||
cond.notify_all();
|
||||
close(eventfd);
|
||||
write(submit_eventfd, &one, sizeof(one));
|
||||
mu.unlock();
|
||||
thread.join();
|
||||
thread->join();
|
||||
ring->unregister_consumer(&consumer);
|
||||
close(submit_eventfd);
|
||||
}
|
||||
|
||||
void msgr_iothread_t::add_to_ringloop(ring_loop_t *outer_loop)
|
||||
void msgr_iothread_t::add_client(osd_client_t *cl)
|
||||
{
|
||||
assert(!this->outer_loop || this->outer_loop == outer_loop);
|
||||
io_uring_sqe *sqe = outer_loop->get_sqe();
|
||||
assert(sqe != NULL);
|
||||
this->outer_loop = outer_loop;
|
||||
this->outer_loop_data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_poll_add(sqe, eventfd, POLLIN);
|
||||
outer_loop_data->callback = [this](ring_data_t *data)
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
if (data->res < 0)
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
assert(!clients[cl->peer_fd]);
|
||||
clients[cl->peer_fd] = cl;
|
||||
epmgr->tfd->set_fd_handler(cl->peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// FIXME: Slight copypaste (see handle_peer_epoll)
|
||||
if (epoll_events & EPOLLIN)
|
||||
{
|
||||
throw std::runtime_error(std::string("eventfd poll failed: ") + strerror(-data->res));
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end())
|
||||
{
|
||||
auto cl = cl_it->second;
|
||||
cl->mu.lock();
|
||||
cl->read_ready++;
|
||||
if (cl->read_ready == 1)
|
||||
{
|
||||
read_ready_clients.push_back(peer_fd);
|
||||
ring->wakeup();
|
||||
}
|
||||
cl->mu.unlock();
|
||||
}
|
||||
}
|
||||
outer_loop_data = NULL;
|
||||
if (stopped)
|
||||
});
|
||||
mu.unlock();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::remove_client(osd_client_t *cl)
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
auto cl_it = clients.find(cl->peer_fd);
|
||||
if (cl_it != clients.end() && cl_it->second == cl)
|
||||
{
|
||||
clients.erase(cl->peer_fd);
|
||||
epmgr->tfd->set_fd_handler(cl->peer_fd, 0, NULL);
|
||||
}
|
||||
mu.unlock();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::wakeup_out(int peer_fd, ring_loop_t *outer_ring)
|
||||
{
|
||||
write_ready_mu.lock();
|
||||
if (!write_ready_clients.size())
|
||||
{
|
||||
io_uring_sqe* sqe = outer_ring->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return;
|
||||
write(submit_eventfd, &one, sizeof(one));
|
||||
}
|
||||
add_to_ringloop(this->outer_loop);
|
||||
ring.loop();
|
||||
};
|
||||
else
|
||||
{
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [](ring_data_t*){};
|
||||
my_uring_prep_write(sqe, submit_eventfd, &one, sizeof(one), 0);
|
||||
}
|
||||
}
|
||||
write_ready_clients.push_back(peer_fd);
|
||||
write_ready_mu.unlock();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::read_requests()
|
||||
{
|
||||
// FIXME: Slight copypaste (see messenger_t::read_requests)
|
||||
auto to_recv = std::move(read_ready_clients);
|
||||
for (int i = 0; i < to_recv.size(); i++)
|
||||
{
|
||||
int peer_fd = to_recv[i];
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_client_t *cl = cl_it->second;
|
||||
cl->mu.lock();
|
||||
auto ok = cl->try_recv(ring, false);
|
||||
cl->mu.unlock();
|
||||
if (!ok)
|
||||
{
|
||||
read_ready_clients.insert(read_ready_clients.end(), to_recv.begin()+i, to_recv.end());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::send_replies()
|
||||
{
|
||||
if (stopped)
|
||||
{
|
||||
return;
|
||||
}
|
||||
write_ready_mu.lock();
|
||||
auto to_send = std::move(write_ready_clients);
|
||||
write_ready_mu.unlock();
|
||||
for (int i = 0; i < to_send.size(); i++)
|
||||
{
|
||||
auto cl_it = clients.find(to_send[i]);
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto cl = cl_it->second;
|
||||
cl->mu.lock();
|
||||
auto ok = cl->try_send(ring, false/*, lock*/);
|
||||
cl->mu.unlock();
|
||||
if (!ok)
|
||||
{
|
||||
// ring is full (rare but what if...)
|
||||
write_ready_mu.lock();
|
||||
write_ready_clients.insert(write_ready_clients.end(), to_send.begin()+i, to_send.end());
|
||||
write_ready_mu.unlock();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::run()
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mu);
|
||||
while (!stopped && !queue.size())
|
||||
cond.wait(lk);
|
||||
if (stopped)
|
||||
return;
|
||||
int i = 0;
|
||||
for (; i < queue.size(); i++)
|
||||
{
|
||||
io_uring_sqe *sqe = ring.get_sqe();
|
||||
if (!sqe)
|
||||
break;
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
*data = std::move(queue[i].data);
|
||||
*sqe = queue[i].sqe;
|
||||
sqe->user_data = (uint64_t)data;
|
||||
}
|
||||
queue.erase(queue.begin(), queue.begin()+i);
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
// We only want to offload sendmsg/recvmsg. Callbacks will be called in main thread
|
||||
ring.submit();
|
||||
ring->loop();
|
||||
mu.unlock();
|
||||
ring->wait();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +236,7 @@ void osd_messenger_t::init()
|
||||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, EPOLLIN, [this](int notify_fd, int epoll_events)
|
||||
{
|
||||
handle_rdma_events();
|
||||
});
|
||||
@@ -149,8 +250,37 @@ void osd_messenger_t::init()
|
||||
{
|
||||
auto iot = new msgr_iothread_t();
|
||||
iothreads.push_back(iot);
|
||||
iot->add_to_ringloop(ringloop);
|
||||
}
|
||||
immediates_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (immediates_eventfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("failed to create set_immediate eventfd: ")+strerror(errno));
|
||||
}
|
||||
tfd->set_fd_handler(immediates_eventfd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(immediates_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
immediates_mu.lock();
|
||||
auto to_run = std::move(immediates);
|
||||
immediates_mu.unlock();
|
||||
if (!to_run.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
for (auto & cb: to_run)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
}
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
@@ -229,6 +359,12 @@ void osd_messenger_t::init()
|
||||
|
||||
osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
if (immediates_eventfd >= 0)
|
||||
{
|
||||
tfd->set_fd_handler(immediates_eventfd, 0, NULL);
|
||||
close(immediates_eventfd);
|
||||
immediates_eventfd = -1;
|
||||
}
|
||||
if (keepalive_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(keepalive_timer_id);
|
||||
@@ -283,7 +419,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->rdma_odp = config["rdma_odp"].bool_value();
|
||||
#endif
|
||||
if (!osd_num)
|
||||
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
|
||||
this->iothread_count = config["client_iothread_count"].is_null() ? 4 : (uint32_t)config["client_iothread_count"].uint64_value();
|
||||
else
|
||||
this->iothread_count = (uint32_t)config["osd_iothread_count"].uint64_value();
|
||||
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
|
||||
@@ -376,6 +512,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
{
|
||||
fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
||||
}
|
||||
clients[peer_fd]->msgr = this;
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
@@ -383,7 +520,8 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
clients[peer_fd]->connect_timeout_id = -1;
|
||||
clients[peer_fd]->osd_num = peer_osd;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
@@ -424,7 +562,11 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
cl->peer_state = PEER_CONNECTED;
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->add_client(cl);
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
@@ -608,7 +750,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(cl->peer_fd, 0, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
@@ -643,13 +785,19 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->msgr = this;
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->add_client(clients[peer_fd]);
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
|
@@ -11,6 +11,7 @@
|
||||
#include <map>
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "json11/json11.hpp"
|
||||
@@ -45,8 +46,13 @@ struct msgr_rdma_connection_t;
|
||||
struct msgr_rdma_context_t;
|
||||
#endif
|
||||
|
||||
struct osd_messenger_t;
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
std::mutex mu;
|
||||
osd_messenger_t *msgr = NULL;
|
||||
|
||||
int refs = 0;
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
@@ -59,6 +65,7 @@ struct osd_client_t
|
||||
osd_num_t osd_num = 0;
|
||||
|
||||
void *in_buf = NULL;
|
||||
uint32_t receive_buffer_size = 0;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
msgr_rdma_connection_t *rdma_conn = NULL;
|
||||
@@ -89,6 +96,17 @@ struct osd_client_t
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t();
|
||||
|
||||
bool try_send(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
int handle_send(int result);
|
||||
|
||||
bool try_recv(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
int handle_read(int result);
|
||||
bool handle_read_buffer(void *curbuf, int remain);
|
||||
bool handle_finished_read();
|
||||
void handle_op_hdr();
|
||||
bool handle_reply_hdr();
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
@@ -111,41 +129,50 @@ struct osd_op_stats_t
|
||||
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
|
||||
};
|
||||
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <thread>
|
||||
|
||||
#ifdef __MOCK__
|
||||
class msgr_iothread_t;
|
||||
#else
|
||||
struct iothread_sqe_t
|
||||
{
|
||||
io_uring_sqe sqe;
|
||||
ring_data_t data;
|
||||
};
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
|
||||
class msgr_iothread_t
|
||||
{
|
||||
protected:
|
||||
ring_loop_t ring;
|
||||
ring_loop_t *outer_loop = NULL;
|
||||
ring_data_t *outer_loop_data = NULL;
|
||||
int eventfd = -1;
|
||||
ring_loop_t *ring = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
ring_consumer_t consumer;
|
||||
int submit_eventfd = -1;
|
||||
bool stopped = false;
|
||||
std::mutex mu;
|
||||
std::condition_variable cond;
|
||||
std::vector<iothread_sqe_t> queue;
|
||||
std::thread thread;
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::mutex write_ready_mu;
|
||||
std::vector<int> write_ready_clients;
|
||||
std::thread *thread = NULL;
|
||||
|
||||
void run();
|
||||
|
||||
void read_requests();
|
||||
|
||||
void send_replies();
|
||||
|
||||
public:
|
||||
|
||||
void handle_client_read(osd_client_t *cl, int res);
|
||||
void handle_client_send(osd_client_t *cl, int res);
|
||||
|
||||
msgr_iothread_t();
|
||||
~msgr_iothread_t();
|
||||
|
||||
void add_sqe(io_uring_sqe & sqe);
|
||||
void add_client(osd_client_t *cl);
|
||||
|
||||
void remove_client(osd_client_t *cl);
|
||||
|
||||
void wakeup_out(int peer_fd, ring_loop_t *outer_ring);
|
||||
|
||||
void stop();
|
||||
void add_to_ringloop(ring_loop_t *outer_loop);
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -176,8 +203,10 @@ protected:
|
||||
std::vector<msgr_iothread_t*> iothreads;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
int immediates_eventfd = -1;
|
||||
std::mutex immediates_mu;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
std::vector<std::function<void()>> immediates;
|
||||
|
||||
public:
|
||||
timerfd_manager_t *tfd;
|
||||
@@ -195,10 +224,13 @@ public:
|
||||
void parse_config(const json11::Json & config);
|
||||
void connect_peer(uint64_t osd_num, json11::Json peer_state);
|
||||
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
|
||||
void stop_client_from_iothread(osd_client_t *cl);
|
||||
void outbox_push(osd_op_t *cur_op);
|
||||
std::function<void(osd_op_t*)> exec_op;
|
||||
std::function<void(osd_num_t)> repeer_pgs;
|
||||
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
|
||||
void handle_client_read(osd_client_t *cl, int res);
|
||||
void handle_client_send(osd_client_t *cl, int res);
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
void accept_connections(int listen_fd);
|
||||
@@ -218,6 +250,9 @@ public:
|
||||
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
|
||||
void measure_exec(osd_op_t *cur_op);
|
||||
|
||||
void set_immediate(std::function<void()> cb);
|
||||
void set_immediate_or_run(std::function<void()> cb);
|
||||
|
||||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
@@ -228,15 +263,7 @@ protected:
|
||||
void cancel_osd_ops(osd_client_t *cl);
|
||||
void cancel_op(osd_op_t *op);
|
||||
|
||||
bool try_send(osd_client_t *cl);
|
||||
void handle_send(int result, osd_client_t *cl);
|
||||
|
||||
bool handle_read(int result, osd_client_t *cl);
|
||||
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||
bool handle_finished_read(osd_client_t *cl);
|
||||
void handle_op_hdr(osd_client_t *cl);
|
||||
bool handle_reply_hdr(osd_client_t *cl);
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
void handle_immediates();
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
void try_send_rdma(osd_client_t *cl);
|
||||
@@ -245,4 +272,6 @@ protected:
|
||||
bool try_recv_rdma(osd_client_t *cl);
|
||||
void handle_rdma_events();
|
||||
#endif
|
||||
|
||||
friend struct osd_client_t;
|
||||
};
|
||||
|
@@ -603,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
|
||||
if (!is_send)
|
||||
{
|
||||
rc->cur_recv--;
|
||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
if (!cl->handle_read_buffer(rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
@@ -666,9 +666,5 @@ void osd_messenger_t::handle_rdma_events()
|
||||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
for (auto cb: set_immediate)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
set_immediate.clear();
|
||||
handle_immediates();
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include "messenger.h"
|
||||
|
||||
void osd_messenger_t::read_requests()
|
||||
@@ -9,71 +10,119 @@ void osd_messenger_t::read_requests()
|
||||
{
|
||||
int peer_fd = read_ready_clients[i];
|
||||
osd_client_t *cl = clients[peer_fd];
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
if (!cl->try_recv(ringloop, use_sync_send_recv))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (cl->read_remaining < receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
||||
io_uring_sqe sqe_local;
|
||||
ring_data_t data_local;
|
||||
sqe_local.user_data = (uint64_t)&data_local;
|
||||
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_read(data->res, cl); };
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
if (iothread)
|
||||
{
|
||||
iothread->add_sqe(sqe_local);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_read(result, cl);
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
read_ready_clients.clear();
|
||||
if (!iothreads.size())
|
||||
{
|
||||
handle_immediates();
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
bool osd_client_t::try_recv(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
bool ret = false;
|
||||
auto cl = this;
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (cl->read_remaining < cl->receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = cl->receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
return false;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
if (msgr->iothreads.size())
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->iothreads[peer_fd % msgr->iothreads.size()]->handle_client_read(this, data->res); };
|
||||
}
|
||||
else
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->handle_client_read(this, data->res); };
|
||||
}
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
msgr->handle_client_read(this, result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_client_read(osd_client_t *cl, int res)
|
||||
{
|
||||
res = cl->handle_read(res);
|
||||
if (res == -ENOENT)
|
||||
{
|
||||
if (!cl->refs)
|
||||
delete cl;
|
||||
}
|
||||
else if (res == -EIO)
|
||||
{
|
||||
stop_client(cl->peer_fd);
|
||||
}
|
||||
else if (res == -EAGAIN)
|
||||
{
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::handle_client_read(osd_client_t *cl, int res)
|
||||
{
|
||||
cl->mu.lock();
|
||||
res = cl->handle_read(res);
|
||||
if (res == -ENOENT)
|
||||
{
|
||||
if (!cl->refs)
|
||||
cl->msgr->set_immediate([cl]() { delete cl; });
|
||||
}
|
||||
cl->mu.unlock();
|
||||
if (res == -EIO)
|
||||
{
|
||||
cl->msgr->stop_client_from_iothread(cl);
|
||||
}
|
||||
else if (res == -EAGAIN)
|
||||
{
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
ring->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
int osd_client_t::handle_read(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
return false;
|
||||
return -ENOENT;
|
||||
}
|
||||
if (result <= 0 && result != -EAGAIN && result != -EINTR)
|
||||
{
|
||||
@@ -82,27 +131,14 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
{
|
||||
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
}
|
||||
stop_client(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len)
|
||||
{
|
||||
cl->read_ready--;
|
||||
if (cl->read_ready > 0)
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
}
|
||||
else
|
||||
{
|
||||
read_ready_clients.push_back(cl->peer_fd);
|
||||
return -EIO;
|
||||
}
|
||||
int expected = cl->read_iov.iov_len;
|
||||
if (result > 0)
|
||||
{
|
||||
if (cl->read_iov.iov_base == cl->in_buf)
|
||||
{
|
||||
if (!handle_read_buffer(cl, cl->in_buf, result))
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
handle_read_buffer(cl->in_buf, result);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -111,28 +147,25 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
cl->recv_list.eat(result);
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
handle_finished_read();
|
||||
}
|
||||
}
|
||||
if (result >= cl->read_iov.iov_len)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
fin:
|
||||
for (auto cb: set_immediate)
|
||||
if (result == -EAGAIN || result == -EINTR || result < expected)
|
||||
{
|
||||
cb();
|
||||
cl->read_ready--;
|
||||
assert(cl->read_ready >= 0);
|
||||
}
|
||||
set_immediate.clear();
|
||||
return ret;
|
||||
if (cl->read_ready > 0)
|
||||
{
|
||||
return -EAGAIN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
|
||||
bool osd_client_t::handle_read_buffer(void *curbuf, int remain)
|
||||
{
|
||||
auto cl = this;
|
||||
// Compose operation(s) from the buffer
|
||||
while (remain > 0)
|
||||
{
|
||||
@@ -168,7 +201,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
||||
}
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
if (!handle_finished_read())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -177,21 +210,20 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
bool osd_client_t::handle_finished_read()
|
||||
{
|
||||
cl->ping_time_remaining = 0;
|
||||
cl->idle_time_remaining = osd_idle_timeout;
|
||||
auto cl = this;
|
||||
cl->recv_list.reset();
|
||||
if (cl->read_state == CL_READ_HDR)
|
||||
{
|
||||
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
||||
return handle_reply_hdr(cl);
|
||||
return handle_reply_hdr();
|
||||
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
|
||||
handle_op_hdr(cl);
|
||||
handle_op_hdr();
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -199,7 +231,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cl->read_op);
|
||||
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
|
||||
msgr->set_immediate([msgr = this->msgr, op = cl->read_op, cl]() { msgr->exec_op(op); });
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
@@ -217,8 +249,9 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
void osd_client_t::handle_op_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
osd_op_t *cur_op = cl->read_op;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
@@ -295,20 +328,21 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cur_op);
|
||||
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
|
||||
msgr->set_immediate([msgr = this->msgr, cur_op, cl]() { msgr->exec_op(cur_op); });
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
bool osd_client_t::handle_reply_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
|
||||
if (req_it == cl->sent_ops.end())
|
||||
{
|
||||
// Command out of sync. Drop connection
|
||||
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
osd_op_t *op = req_it->second;
|
||||
@@ -325,7 +359,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
|
||||
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
||||
cl->sent_ops[op->req.hdr.id] = op;
|
||||
stop_client(cl->peer_fd);
|
||||
msgr->stop_client_from_iothread(cl);
|
||||
return false;
|
||||
}
|
||||
if (bmp_len > 0)
|
||||
@@ -401,24 +435,92 @@ reuse:
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_reply_ready(osd_op_t *op)
|
||||
void osd_client_t::handle_reply_ready(osd_op_t *op)
|
||||
{
|
||||
// Measure subop latency
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
msgr->set_immediate([msgr = this->msgr, op, cl = this]()
|
||||
{
|
||||
// Measure subop latency
|
||||
auto & stats = msgr->stats;
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
set_immediate.push_back([op]()
|
||||
{
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
{
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
// Copy lambda to be unaffected by `delete op`
|
||||
std::function<void(osd_op_t*)>(op->callback)(op);
|
||||
});
|
||||
}
|
||||
|
||||
static uint64_t one = 1;
|
||||
|
||||
void osd_messenger_t::set_immediate(std::function<void()> cb/*, ring_loop_t *ringloop*/)
|
||||
{
|
||||
if (!iothreads.size())
|
||||
{
|
||||
immediates.push_back(cb);
|
||||
return;
|
||||
}
|
||||
immediates_mu.lock();
|
||||
bool wakeup_main_thread = !immediates.size();
|
||||
immediates.push_back(cb);
|
||||
immediates_mu.unlock();
|
||||
if (wakeup_main_thread)
|
||||
{
|
||||
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
|
||||
// if (!sqe)
|
||||
// {
|
||||
write(immediates_eventfd, &one, sizeof(one));
|
||||
// FIXME: Can't use ringloop here, oops
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
// data->callback = [](ring_data_t*){};
|
||||
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::set_immediate_or_run(std::function<void()> cb/*, ring_loop_t *ringloop*/)
|
||||
{
|
||||
if (!iothreads.size())
|
||||
{
|
||||
cb();
|
||||
return;
|
||||
}
|
||||
immediates_mu.lock();
|
||||
bool wakeup_main_thread = !immediates.size();
|
||||
immediates.push_back(cb);
|
||||
immediates_mu.unlock();
|
||||
if (wakeup_main_thread)
|
||||
{
|
||||
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
|
||||
// if (!sqe)
|
||||
// {
|
||||
write(immediates_eventfd, &one, sizeof(one));
|
||||
// FIXME: Can't use ringloop here, oops
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
// data->callback = [](ring_data_t*){};
|
||||
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_immediates()
|
||||
{
|
||||
auto to_run = std::move(immediates);
|
||||
for (auto & cb: to_run)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
}
|
||||
|
@@ -15,10 +15,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||
}
|
||||
else
|
||||
else if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
}
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.lock();
|
||||
}
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
// Check that operation actually belongs to this client
|
||||
// FIXME: Review if this is still needed
|
||||
bool found = false;
|
||||
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
|
||||
{
|
||||
@@ -32,6 +39,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
if (!found)
|
||||
{
|
||||
delete cur_op;
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -39,7 +50,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
|
||||
}
|
||||
else
|
||||
@@ -108,21 +118,36 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->peer_state == PEER_RDMA)
|
||||
{
|
||||
if (iothreads.size())
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (!ringloop)
|
||||
if (iothreads.size())
|
||||
{
|
||||
int should_wakeup = !cl->write_msg.msg_iovlen && !cl->write_state;
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
cl->mu.unlock();
|
||||
if (should_wakeup)
|
||||
{
|
||||
auto iot = iothreads[cl->peer_fd % iothreads.size()];
|
||||
iot->wakeup_out(cl->peer_fd, ringloop);
|
||||
}
|
||||
}
|
||||
else if (!ringloop)
|
||||
{
|
||||
// FIXME: It's worse because it doesn't allow batching
|
||||
while (cl->outbox.size())
|
||||
{
|
||||
try_send(cl);
|
||||
cl->try_send(NULL, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !cl->try_send(ringloop, use_sync_send_recv)) && (cl->write_state == 0))
|
||||
{
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
@@ -180,8 +205,9 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
bool osd_client_t::try_send(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
auto cl = this;
|
||||
int peer_fd = cl->peer_fd;
|
||||
if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
|
||||
{
|
||||
@@ -189,11 +215,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
}
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
||||
io_uring_sqe sqe_local;
|
||||
ring_data_t data_local;
|
||||
sqe_local.user_data = (uint64_t)&data_local;
|
||||
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return false;
|
||||
@@ -202,12 +224,15 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||
cl->refs++;
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
||||
if (iothread)
|
||||
if (msgr->iothreads.size())
|
||||
{
|
||||
iothread->add_sqe(sqe_local);
|
||||
data->callback = [this](ring_data_t *data) { msgr->iothreads[this->peer_fd % msgr->iothreads.size()]->handle_client_send(this, data->res); };
|
||||
}
|
||||
else
|
||||
{
|
||||
data->callback = [this](ring_data_t *data) { msgr->handle_client_send(this, data->res); };
|
||||
}
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -219,18 +244,68 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_send(result, cl);
|
||||
msgr->handle_client_send(this, result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Finish a send for client <cl> and act on the status
// returned by osd_client_t::handle_send():
// -ENOENT - free the client if nothing references it anymore
// -EIO    - disconnect the client
// -EAGAIN - put the client into the write-ready list again
void osd_messenger_t::handle_client_send(osd_client_t *cl, int res)
{
    switch (cl->handle_send(res))
    {
    case -ENOENT:
        if (!cl->refs)
        {
            delete cl;
        }
        break;
    case -EIO:
        stop_client(cl->peer_fd);
        break;
    case -EAGAIN:
        write_ready_clients.push_back(cl->peer_fd);
        break;
    default:
        break;
    }
}
|
||||
|
||||
// I/O thread variant of send completion handling.
// osd_client_t::handle_send() is called under the client mutex,
// then its status is handled after the mutex is released.
void msgr_iothread_t::handle_client_send(osd_client_t *cl, int res)
{
    cl->mu.lock();
    const int status = cl->handle_send(res);
    if (status == -ENOENT && !cl->refs)
    {
        // Deletion is handed over to the messenger via set_immediate()
        cl->msgr->set_immediate([cl]() { delete cl; });
    }
    cl->mu.unlock();
    switch (status)
    {
    case -EIO:
        cl->msgr->stop_client_from_iothread(cl);
        break;
    case -EAGAIN:
        // Queue the client for another send attempt and wake up the ring
        write_ready_mu.lock();
        write_ready_clients.push_back(cl->peer_fd);
        write_ready_mu.unlock();
        ring->wakeup();
        break;
    default:
        break;
    }
}
|
||||
|
||||
void osd_messenger_t::send_replies()
|
||||
{
|
||||
if (iothreads.size())
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||
{
|
||||
int peer_fd = write_ready_clients[i];
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end() && !try_send(cl_it->second))
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->try_send(ringloop, use_sync_send_recv))
|
||||
{
|
||||
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
||||
return;
|
||||
@@ -239,24 +314,20 @@ void osd_messenger_t::send_replies()
|
||||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
int osd_client_t::handle_send(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
cl->write_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
return;
|
||||
return -ENOENT;
|
||||
}
|
||||
if (result < 0 && result != -EAGAIN && result != -EINTR)
|
||||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
stop_client(cl->peer_fd);
|
||||
return;
|
||||
return -EIO;
|
||||
}
|
||||
if (result >= 0)
|
||||
{
|
||||
@@ -269,7 +340,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[done].op;
|
||||
msgr->set_immediate_or_run([op = cl->outbox[done].op] { delete op; });
|
||||
}
|
||||
result -= iov.iov_len;
|
||||
done++;
|
||||
@@ -299,26 +370,35 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
// FIXME: Do something better than just forgetting the FD
|
||||
// FIXME: Ignore pings during RDMA state transition
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
msgr->set_immediate_or_run([cl = this, msgr = this->msgr, peer_fd = this->peer_fd]()
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
auto cl_it = msgr->clients.find(peer_fd);
|
||||
if (cl_it == msgr->clients.end() || cl_it->second != cl)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
return;
|
||||
}
|
||||
if (msgr->log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with client %d using RDMA\n", peer_fd);
|
||||
}
|
||||
msgr->tfd->set_fd_handler(peer_fd, 0, [msgr](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
msgr->handle_peer_epoll(peer_fd, epoll_events);
|
||||
}
|
||||
});
|
||||
// Add the initial receive request
|
||||
msgr->try_recv_rdma(cl);
|
||||
});
|
||||
// Add the initial receive request
|
||||
try_recv_rdma(cl);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (cl->write_state != 0)
|
||||
{
|
||||
write_ready_clients.push_back(cl->peer_fd);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@@ -11,6 +11,7 @@
|
||||
|
||||
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
||||
{
|
||||
cl->mu.lock();
|
||||
std::vector<osd_op_t*> cancel_ops;
|
||||
cancel_ops.resize(cl->sent_ops.size());
|
||||
int i = 0;
|
||||
@@ -20,6 +21,7 @@ void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
||||
}
|
||||
cl->sent_ops.clear();
|
||||
cl->outbox.clear();
|
||||
cl->mu.unlock();
|
||||
for (auto op: cancel_ops)
|
||||
{
|
||||
cancel_op(op);
|
||||
@@ -53,8 +55,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
return;
|
||||
}
|
||||
osd_client_t *cl = it->second;
|
||||
cl->mu.lock();
|
||||
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
cl->mu.unlock();
|
||||
return;
|
||||
}
|
||||
if (log_level > 0)
|
||||
@@ -71,6 +75,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||
cl->refs++;
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
cl->mu.unlock();
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// ...and forget OSD peer
|
||||
@@ -78,7 +83,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// Then remove FD from the eventloop so we don't accidentally read something
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
if (iothreads.size())
|
||||
{
|
||||
iothreads[peer_fd % iothreads.size()]->remove_client(cl);
|
||||
}
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
@@ -108,17 +117,24 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
repeer_pgs(cl->osd_num);
|
||||
}
|
||||
// Then cancel all operations
|
||||
cl->mu.lock();
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (!cl->read_op->callback)
|
||||
auto op = cl->read_op;
|
||||
cl->read_op = NULL;
|
||||
cl->mu.unlock();
|
||||
if (!op->callback)
|
||||
{
|
||||
delete cl->read_op;
|
||||
delete op;
|
||||
}
|
||||
else
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
cancel_op(op);
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->mu.unlock();
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
@@ -131,11 +147,32 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
{
|
||||
clients.erase(it);
|
||||
}
|
||||
cl->mu.lock();
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0 || force_delete)
|
||||
{
|
||||
cl->mu.unlock();
|
||||
delete cl;
|
||||
}
|
||||
else
|
||||
cl->mu.unlock();
|
||||
}
|
||||
|
||||
// Stop client <cl>. Without I/O threads this is a direct stop_client() call.
// With I/O threads the stop is deferred through set_immediate(), and is only
// performed if the FD is still registered for the very same client object.
void osd_messenger_t::stop_client_from_iothread(osd_client_t *cl)
{
    if (iothreads.size())
    {
        const int peer_fd = cl->peer_fd;
        set_immediate([this, cl, peer_fd]()
        {
            auto found = clients.find(peer_fd);
            if (found == clients.end() || found->second != cl)
            {
                // The client is already gone or the FD now belongs to another client
                return;
            }
            stop_client(peer_fd);
        });
        return;
    }
    stop_client(cl->peer_fd);
}
|
||||
|
||||
osd_client_t::~osd_client_t()
|
||||
|
@@ -253,7 +253,7 @@ nla_put_failure:
|
||||
const char *exe_name = NULL;
|
||||
|
||||
const char *help_text =
|
||||
"Vitastor NBD proxy " VITASTOR_VERSION "\n"
|
||||
"Vitastor NBD proxy " VERSION "\n"
|
||||
"(c) Vitaliy Filippov, 2020+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"COMMANDS:\n"
|
||||
@@ -655,7 +655,7 @@ help:
|
||||
ringloop->register_consumer(&consumer);
|
||||
// Add FD to epoll
|
||||
bool stop = false;
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], false, [this, &stop](int peer_fd, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], EPOLLIN, [this, &stop](int peer_fd, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 1.7.1
|
||||
Version: 1.6.1
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -384,28 +384,6 @@ int vitastor_c_inode_get_readonly(void *handle)
|
||||
return watch->cfg.readonly;
|
||||
}
|
||||
|
||||
uint64_t vitastor_c_inode_get_parent_id(void *handle)
|
||||
{
|
||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||
return watch->cfg.parent_id;
|
||||
}
|
||||
|
||||
char* vitastor_c_inode_get_meta(void *handle)
|
||||
{
|
||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||
if (watch->cfg.meta.is_null())
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return strdup(watch->cfg.meta.dump().c_str());
|
||||
}
|
||||
|
||||
uint64_t vitastor_c_inode_get_mod_revision(void *handle)
|
||||
{
|
||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||
return watch->cfg.mod_revision;
|
||||
}
|
||||
|
||||
uint32_t vitastor_c_inode_get_immediate_commit(vitastor_c *client, uint64_t inode_num)
|
||||
{
|
||||
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
|
||||
|
@@ -69,9 +69,6 @@ void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler c
|
||||
void vitastor_c_close_watch(vitastor_c *client, void *handle);
|
||||
uint64_t vitastor_c_inode_get_size(void *handle);
|
||||
uint64_t vitastor_c_inode_get_num(void *handle);
|
||||
uint64_t vitastor_c_inode_get_parent_id(void *handle);
|
||||
char* vitastor_c_inode_get_meta(void *handle);
|
||||
uint64_t vitastor_c_inode_get_mod_revision(void *handle);
|
||||
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
|
||||
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
|
||||
int vitastor_c_inode_get_readonly(void *handle);
|
||||
|
@@ -12,9 +12,7 @@ add_library(vitastor_cli STATIC
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
cli_modify.cpp
|
||||
cli_modify_osd.cpp
|
||||
cli_osd_tree.cpp
|
||||
cli_pg_ls.cpp
|
||||
cli_flatten.cpp
|
||||
cli_merge.cpp
|
||||
cli_rm_data.cpp
|
||||
|
@@ -17,7 +17,7 @@
|
||||
static const char *exe_name = NULL;
|
||||
|
||||
static const char* help_text =
|
||||
"Vitastor command-line tool " VITASTOR_VERSION "\n"
|
||||
"Vitastor command-line tool " VERSION "\n"
|
||||
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"COMMANDS:\n"
|
||||
@@ -70,7 +70,6 @@ static const char* help_text =
|
||||
" --wait-list Retrieve full objects listings before starting to remove objects.\n"
|
||||
" Requires more memory, but allows to show correct removal progress.\n"
|
||||
" --min-offset Purge only data starting with specified offset.\n"
|
||||
" --max-offset Purge only data before specified offset.\n"
|
||||
"\n"
|
||||
"vitastor-cli merge-data <from> <to> [--target <target>]\n"
|
||||
" Merge layer data without changing metadata. Merge <from>..<to> to <target>.\n"
|
||||
@@ -119,23 +118,11 @@ static const char* help_text =
|
||||
" With --dry-run only checks if deletion is possible without data loss and\n"
|
||||
" redundancy degradation.\n"
|
||||
"\n"
|
||||
"vitastor-cli osd-tree [-l|--long]\n"
|
||||
" Show current OSD tree, optionally with I/O statistics if -l is specified.\n"
|
||||
"vitastor-cli osd-tree\n"
|
||||
" Show current OSD tree.\n"
|
||||
"\n"
|
||||
"vitastor-cli osds|ls-osd|osd-ls [-l|--long]\n"
|
||||
" Show current OSDs as list, optionally with I/O statistics if -l is specified.\n"
|
||||
"\n"
|
||||
"vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>\n"
|
||||
" Set OSD reweight, tags or noout flag.\n"
|
||||
"\n"
|
||||
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
|
||||
" List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:\n"
|
||||
" --pool <pool name or number> Only list PGs of the given pool.\n"
|
||||
" --min <min pg number> Only list PGs with number >= min.\n"
|
||||
" --max <max pg number> Only list PGs with number <= max.\n"
|
||||
" Examples:\n"
|
||||
" vitastor-cli pg-list active+degraded\n"
|
||||
" vitastor-cli pg-list ^active\n"
|
||||
"vitastor-cli osds|ls-osd|osd-ls\n"
|
||||
" Show current OSDs as list.\n"
|
||||
"\n"
|
||||
"vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]\n"
|
||||
" Create a pool. Required parameters:\n"
|
||||
@@ -149,7 +136,7 @@ static const char* help_text =
|
||||
" --osd_tags <tag>[,<tag>]... Put pool only on OSDs tagged with all specified tags\n"
|
||||
" --block_size 128k Put pool only on OSDs with this data block size\n"
|
||||
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
|
||||
" --immediate_commit all Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
|
||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||
@@ -419,23 +406,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
cfg["flat"] = true;
|
||||
action_cb = p->start_osd_tree(cfg);
|
||||
}
|
||||
else if (cmd[0] == "modify-osd")
|
||||
{
|
||||
// Modify OSD configuration
|
||||
if (cmd.size() > 1)
|
||||
cfg["osd_num"] = cmd[1];
|
||||
action_cb = p->start_modify_osd(cfg);
|
||||
}
|
||||
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs")
|
||||
{
|
||||
// Modify OSD configuration
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cmd.erase(cmd.begin(), cmd.begin()+1);
|
||||
cfg["pg_state"] = cmd;
|
||||
}
|
||||
action_cb = p->start_pg_list(cfg);
|
||||
}
|
||||
else if (cmd[0] == "create-pool" || cmd[0] == "pool-create")
|
||||
{
|
||||
// Create a new pool
|
||||
|
@@ -65,9 +65,7 @@ public:
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_merge(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_modify_osd(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_osd_tree(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pg_list(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_create(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_pool_rm(json11::Json);
|
||||
|
@@ -1,210 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "str_util.h"
|
||||
#include "http_client.h"
|
||||
|
||||
// Reweight OSD, change tags or set noout flag
|
||||
struct osd_changer_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
uint64_t osd_num = 0;
|
||||
bool set_tags = false;
|
||||
std::vector<std::string> new_tags;
|
||||
bool set_reweight = false;
|
||||
double new_reweight = 1;
|
||||
bool set_noout = false;
|
||||
double new_noout = false;
|
||||
bool force = false;
|
||||
|
||||
json11::Json::object osd_cfg;
|
||||
uint64_t osd_cfg_mod_rev = 0;
|
||||
json11::Json::array compare, success;
|
||||
|
||||
int state = 0;
|
||||
std::function<bool(cli_result_t &)> cb;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
if (!osd_num)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "OSD number is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!set_tags && !set_reweight && !set_noout)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Nothing to update" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (set_reweight && new_reweight < 0)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be negative" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
});
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
{
|
||||
auto osd_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]).value;
|
||||
if (!osd_stats.is_object() && !force)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "OSD "+std::to_string(osd_num)+" does not exist. Use --force to set configuration anyway" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
|
||||
osd_cfg_mod_rev = kv.mod_revision;
|
||||
osd_cfg = kv.value.object_items();
|
||||
if (set_reweight)
|
||||
{
|
||||
if (new_reweight != 1)
|
||||
osd_cfg["reweight"] = new_reweight;
|
||||
else
|
||||
osd_cfg.erase("reweight");
|
||||
}
|
||||
if (set_tags)
|
||||
{
|
||||
if (new_tags.size())
|
||||
osd_cfg["tags"] = new_tags;
|
||||
else
|
||||
osd_cfg.erase("tags");
|
||||
}
|
||||
if (set_noout)
|
||||
{
|
||||
if (new_noout)
|
||||
osd_cfg["noout"] = true;
|
||||
else
|
||||
osd_cfg.erase("noout");
|
||||
}
|
||||
compare.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", osd_cfg_mod_rev+1 },
|
||||
});
|
||||
if (!osd_cfg.size())
|
||||
{
|
||||
success.push_back(json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
success.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
|
||||
{ "value", base64_encode(json11::Json(osd_cfg).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "compare", compare },
|
||||
{ "success", success },
|
||||
});
|
||||
state = 2;
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!parent->etcd_result["succeeded"].bool_value())
|
||||
{
|
||||
result = (cli_result_t){ .err = EAGAIN, .text = "OSD "+std::to_string(osd_num)+" configuration was modified by someone else, please repeat your request" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
result = (cli_result_t){
|
||||
.err = 0,
|
||||
.text = "OSD "+std::to_string(osd_num)+" configuration modified",
|
||||
.data = osd_cfg,
|
||||
};
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
// Create an osd_changer_t from the parsed command options ("osd_num", "tags",
// "reweight", "noout", "force") and return a callback which drives it.
// The callback returns false while the change is in progress; when it's done,
// it fills <result>, frees the changer and returns true.
std::function<bool(cli_result_t &)> cli_tool_t::start_modify_osd(json11::Json cfg)
{
    auto changer = new osd_changer_t();
    changer->parent = this;
    changer->osd_num = cfg["osd_num"].uint64_value();
    changer->force = cfg["force"].bool_value();
    // Tags may be passed either as a comma-separated string or as an array
    const auto & tags_opt = cfg["tags"];
    changer->set_tags = !tags_opt.is_null();
    if (changer->set_tags)
    {
        if (tags_opt.is_string())
        {
            if (!tags_opt.string_value().empty())
            {
                changer->new_tags = explode(",", tags_opt.string_value(), true);
            }
        }
        else if (tags_opt.is_array())
        {
            for (auto tag: tags_opt.array_items())
            {
                changer->new_tags.push_back(tag.as_string());
            }
        }
    }
    const auto & reweight_opt = cfg["reweight"];
    if (!reweight_opt.is_null())
    {
        changer->set_reweight = true;
        changer->new_reweight = reweight_opt.number_value();
    }
    const auto & noout_opt = cfg["noout"];
    if (!noout_opt.is_null())
    {
        changer->set_noout = true;
        changer->new_noout = json_is_true(noout_opt);
    }
    return [changer](cli_result_t & result)
    {
        changer->loop();
        if (!changer->is_done())
        {
            return false;
        }
        result = changer->result;
        delete changer;
        return true;
    };
}
|
@@ -17,7 +17,6 @@ struct placement_osd_t
|
||||
uint64_t free;
|
||||
bool up;
|
||||
double reweight;
|
||||
bool noout;
|
||||
uint32_t block_size, bitmap_granularity, immediate_commit;
|
||||
};
|
||||
|
||||
@@ -133,10 +132,9 @@ resume_1:
|
||||
.free = kv.second["free"].uint64_value(),
|
||||
.up = parent->cli->st_cli.peer_states.find(kv.first) != parent->cli->st_cli.peer_states.end(),
|
||||
.reweight = 1,
|
||||
.noout = false,
|
||||
.block_size = (uint32_t)kv.second["data_block_size"].uint64_value(),
|
||||
.bitmap_granularity = (uint32_t)kv.second["bitmap_granularity"].uint64_value(),
|
||||
.immediate_commit = etcd_state_client_t::parse_immediate_commit(kv.second["immediate_commit"].string_value(), IMMEDIATE_NONE),
|
||||
.immediate_commit = etcd_state_client_t::parse_immediate_commit(kv.second["immediate_commit"].string_value()),
|
||||
};
|
||||
if (tree->nodes.find(osd.parent) == tree->nodes.end())
|
||||
{
|
||||
@@ -156,7 +154,6 @@ resume_1:
|
||||
for (auto & jtag: osd_cfg["tags"].array_items())
|
||||
osd.tags.push_back(jtag.string_value());
|
||||
}
|
||||
osd.noout = osd_cfg["noout"].bool_value();
|
||||
}
|
||||
auto np_it = node_placement.find(std::to_string(osd.num));
|
||||
if (np_it != node_placement.end())
|
||||
@@ -181,7 +178,7 @@ resume_1:
|
||||
return tree;
|
||||
}
|
||||
|
||||
void format_tree()
|
||||
std::string format_tree()
|
||||
{
|
||||
std::vector<std::string> node_seq = { "" };
|
||||
std::vector<int> indents = { -1 };
|
||||
@@ -201,39 +198,6 @@ resume_1:
|
||||
}
|
||||
}
|
||||
json11::Json::array fmt_items;
|
||||
if (parent->json_output)
|
||||
{
|
||||
for (int i = 1; i < node_seq.size(); i++)
|
||||
{
|
||||
auto & node = placement_tree->nodes.at(node_seq[i]);
|
||||
fmt_items.push_back(json11::Json::object{
|
||||
{ "type", node.level },
|
||||
{ "name", node.name },
|
||||
{ "parent", node.parent },
|
||||
});
|
||||
for (uint64_t osd_num: node.child_osds)
|
||||
{
|
||||
auto & osd = placement_tree->osds.at(osd_num);
|
||||
fmt_items.push_back(json11::Json::object{
|
||||
{ "type", "osd" },
|
||||
{ "name", osd.num },
|
||||
{ "parent", node.name },
|
||||
{ "up", osd.up ? "up" : "down" },
|
||||
{ "size", osd.size },
|
||||
{ "free", osd.free },
|
||||
{ "reweight", osd.reweight },
|
||||
{ "noout", osd.noout },
|
||||
{ "tags", osd.tags },
|
||||
{ "block", (uint64_t)osd.block_size },
|
||||
{ "bitmap", (uint64_t)osd.bitmap_granularity },
|
||||
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
|
||||
{ "op_stats", osd_stats[osd_num]["op_stats"] },
|
||||
});
|
||||
}
|
||||
}
|
||||
result.data = fmt_items;
|
||||
return;
|
||||
}
|
||||
for (int i = 1; i < node_seq.size(); i++)
|
||||
{
|
||||
auto & node = placement_tree->nodes.at(node_seq[i]);
|
||||
@@ -265,7 +229,6 @@ resume_1:
|
||||
{ "size", format_size(osd.size, false, true) },
|
||||
{ "used", format_q(100.0*(osd.size - osd.free)/osd.size)+" %" },
|
||||
{ "reweight", format_q(osd.reweight) },
|
||||
{ "noout", osd.noout ? "noout" : "-" },
|
||||
{ "tags", implode(",", osd.tags) },
|
||||
{ "block", format_size(osd.block_size, false, true) },
|
||||
{ "bitmap", format_size(osd.bitmap_granularity, false, true) },
|
||||
@@ -338,10 +301,6 @@ resume_1:
|
||||
{ "key", "commit" },
|
||||
{ "title", "IMM" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "noout" },
|
||||
{ "title", "NOOUT" },
|
||||
});
|
||||
if (show_stats)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
@@ -381,7 +340,7 @@ resume_1:
|
||||
{ "title", "LAT" },
|
||||
});
|
||||
}
|
||||
result.text = print_table(fmt_items, cols, parent->color);
|
||||
return print_table(fmt_items, cols, parent->color);
|
||||
}
|
||||
|
||||
void loop()
|
||||
@@ -392,7 +351,7 @@ resume_1:
|
||||
load_osd_tree();
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
format_tree();
|
||||
result.text = format_tree();
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
@@ -1,288 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2024
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
struct pg_lister_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
uint64_t pool_id = 0;
|
||||
std::string pool_name;
|
||||
std::vector<std::string> pg_state;
|
||||
uint64_t min_pg_num = 0;
|
||||
uint64_t max_pg_num = 0;
|
||||
|
||||
std::map<pool_pg_num_t, json11::Json> pg_stats;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done() { return state == 100; }
|
||||
|
||||
void load_pg_stats()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (pool_name != "")
|
||||
{
|
||||
pool_id = 0;
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if (pp.second.name == pool_name)
|
||||
{
|
||||
pool_id = pp.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/pgstats"+(pool_id ? "/"+std::to_string(pool_id)+"/" : "/")) },
|
||||
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/pgstats"+(pool_id ? "/"+std::to_string(pool_id)+"0" : "0")) },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
});
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pgstats/", [&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
|
||||
{
|
||||
pg_stats[(pool_pg_num_t){ .pool_id = pool_id, .pg_num = (pg_num_t)pg_num }] = value;
|
||||
});
|
||||
}
|
||||
|
||||
void format_pgs()
|
||||
{
|
||||
uint64_t is_not = ((uint64_t)1 << 63);
|
||||
std::vector<uint64_t> masks;
|
||||
if (pg_state.size())
|
||||
{
|
||||
for (auto & st: pg_state)
|
||||
{
|
||||
if (st.size())
|
||||
{
|
||||
uint64_t mask = 0;
|
||||
size_t pos = 0;
|
||||
if (st[0] == '!' || st[0] == '^')
|
||||
{
|
||||
mask |= is_not;
|
||||
pos++;
|
||||
}
|
||||
size_t prev = pos;
|
||||
while (true)
|
||||
{
|
||||
if (pos < st.size() && (st[pos] >= 'a' && st[pos] <= 'z' || st[pos] == '_'))
|
||||
pos++;
|
||||
else
|
||||
{
|
||||
if (pos > prev)
|
||||
{
|
||||
std::string bit = st.substr(prev, pos-prev);
|
||||
bool found = false;
|
||||
for (int i = 0; i < pg_state_bit_count; i++)
|
||||
{
|
||||
if (pg_state_names[i] == bit)
|
||||
{
|
||||
mask |= (uint64_t)1 << i;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Unknown PG state "+bit };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (pos < st.size() && !(st[pos] >= 'a' && st[pos] <= 'z' || st[pos] == '_'))
|
||||
pos++;
|
||||
prev = pos;
|
||||
if (pos >= st.size())
|
||||
break;
|
||||
}
|
||||
}
|
||||
masks.push_back(mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
json11::Json::array pgs;
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if ((!pool_id || pp.first == pool_id) && (pool_name == "" || pp.second.name == pool_name))
|
||||
{
|
||||
for (auto & pgp: pp.second.pg_config)
|
||||
{
|
||||
if (min_pg_num && pgp.first < min_pg_num || max_pg_num && pgp.first > max_pg_num)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (masks.size())
|
||||
{
|
||||
bool found = false;
|
||||
for (auto mask: masks)
|
||||
{
|
||||
if ((mask & is_not)
|
||||
? (pgp.second.cur_state & (mask & ~is_not)) != (mask & ~is_not)
|
||||
: ((pgp.second.cur_state & mask) == mask))
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
continue;
|
||||
}
|
||||
json11::Json::array state_names;
|
||||
for (int i = 0; i < pg_state_bit_count; i++)
|
||||
{
|
||||
if (pgp.second.cur_state & (1 << i))
|
||||
{
|
||||
state_names.push_back(std::string(pg_state_names[i]));
|
||||
}
|
||||
}
|
||||
if (!pgp.second.cur_state)
|
||||
{
|
||||
state_names.push_back("offline");
|
||||
}
|
||||
auto stat = pg_stats[(pool_pg_num_t){ .pool_id = pp.first, .pg_num = pgp.first }].object_items();
|
||||
stat.erase("write_osd_set");
|
||||
stat["pool_id"] = (uint64_t)pp.first;
|
||||
stat["pool_name"] = pp.second.name;
|
||||
stat["pg_num"] = (uint64_t)pgp.first;
|
||||
stat["pause"] = pgp.second.pause;
|
||||
stat["state"] = state_names;
|
||||
stat["cur_primary"] = pgp.second.cur_primary;
|
||||
stat["target_primary"] = pgp.second.primary;
|
||||
stat["target_set"] = pgp.second.target_set;
|
||||
stat["target_history"] = pgp.second.target_history;
|
||||
stat["all_peers"] = pgp.second.all_peers;
|
||||
stat["epoch"] = pgp.second.epoch;
|
||||
stat["next_scrub"] = pgp.second.next_scrub;
|
||||
if (!parent->json_output)
|
||||
{
|
||||
stat["fmt_state"] = implode("+", state_names);
|
||||
stat["fmt_primary"] = (!pgp.second.primary && !pgp.second.cur_primary
|
||||
? "-"
|
||||
: (std::to_string(pgp.second.cur_primary) + (pgp.second.primary == pgp.second.cur_primary
|
||||
? ""
|
||||
: "->"+std::to_string(pgp.second.primary))));
|
||||
stat["fmt_target_set"] = implode(",", stat["target_set"]);
|
||||
uint64_t pg_block = pp.second.data_block_size * (pp.second.scheme == POOL_SCHEME_REPLICATED
|
||||
? 1 : (pp.second.pg_size-pp.second.parity_chunks));
|
||||
stat["fmt_clean"] = format_size(stat["clean_count"].uint64_value() * pg_block);
|
||||
stat["fmt_misplaced"] = format_size(stat["misplaced_count"].uint64_value() * pg_block);
|
||||
stat["fmt_degraded"] = format_size(stat["degraded_count"].uint64_value() * pg_block);
|
||||
stat["fmt_incomplete"] = format_size(stat["incomplete_count"].uint64_value() * pg_block);
|
||||
}
|
||||
pgs.push_back(stat);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (parent->json_output)
|
||||
{
|
||||
result.data = pgs;
|
||||
return;
|
||||
}
|
||||
json11::Json::array cols;
|
||||
if (!pool_id)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "pool_name" },
|
||||
{ "title", "POOL" },
|
||||
});
|
||||
}
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "pg_num" },
|
||||
{ "title", "NUM" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_target_set" },
|
||||
{ "title", "OSD SET" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_primary" },
|
||||
{ "title", "PRIMARY" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_clean" },
|
||||
{ "title", "DATA CLEAN" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_misplaced" },
|
||||
{ "title", "MISPLACED" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_misplaced" },
|
||||
{ "title", "DEGRADED" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_incomplete" },
|
||||
{ "title", "INCOMPLETE" },
|
||||
});
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "fmt_state" },
|
||||
{ "title", "STATE" },
|
||||
});
|
||||
result.text = print_table(pgs, cols, parent->color);
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
resume_1:
|
||||
load_pg_stats();
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
format_pgs();
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
// Create a PG listing operation from the command configuration.
//
// Reads "pool" (a non-zero number is taken as pool ID, otherwise its string value
// is taken as pool name), "pg_state" (an array of strings or a single string),
// "min" and "max" from cfg.
// Returns a callback which advances the operation by one step; it returns true
// and fills `result` once the operation is finished. The lister object is freed
// by the callback at that point.
std::function<bool(cli_result_t &)> cli_tool_t::start_pg_list(json11::Json cfg)
{
    auto lister = new pg_lister_t();
    lister->parent = this;

    const uint64_t pool_num = cfg["pool"].uint64_value();
    if (pool_num)
    {
        lister->pool_id = pool_num;
    }
    else
    {
        lister->pool_name = cfg["pool"].string_value();
    }

    // Array form first, then the single string form
    const auto & state_filter = cfg["pg_state"];
    for (auto & item: state_filter.array_items())
    {
        lister->pg_state.push_back(item.string_value());
    }
    if (state_filter.is_string())
    {
        lister->pg_state.push_back(state_filter.string_value());
    }

    lister->min_pg_num = cfg["min"].uint64_value();
    lister->max_pg_num = cfg["max"].uint64_value();

    return [lister](cli_result_t & result)
    {
        lister->loop();
        if (!lister->is_done())
        {
            return false;
        }
        result = lister->result;
        delete lister;
        return true;
    };
}
|
@@ -71,7 +71,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||
auto & key = kv_it->first;
|
||||
auto & value = kv_it->second;
|
||||
if (key == "pg_size" || key == "parity_chunks" || key == "pg_minsize" ||
|
||||
key == "pg_count" || key == "max_osd_combinations")
|
||||
key == "pg_count" || key == "max_osd_combinations" ||
|
||||
key == "bitmap_granularity" || key == "pg_stripe_size")
|
||||
{
|
||||
if (value.is_number() && value.uint64_value() != value.number_value() ||
|
||||
value.is_string() && !value.uint64_value() && value.string_value() != "0")
|
||||
@@ -80,14 +81,13 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||
}
|
||||
value = value.uint64_value();
|
||||
}
|
||||
else if (key == "block_size" || key == "bitmap_granularity" || key == "pg_stripe_size")
|
||||
else if (key == "block_size")
|
||||
{
|
||||
uint64_t sz = value.is_string() ? parse_size(value.string_value()) : value.uint64_value();
|
||||
if (!sz)
|
||||
uint64_t block_size = value.is_string() ? parse_size(value.string_value()) : value.uint64_value();
|
||||
if (!block_size)
|
||||
{
|
||||
return key+" must be an integer with or without size suffix (K/M/G/T)";
|
||||
}
|
||||
value = sz;
|
||||
}
|
||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
|
||||
@@ -319,7 +319,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||
}
|
||||
|
||||
// immediate_commit
|
||||
if (!cfg["immediate_commit"].is_null() && etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value(), UINT32_MAX) == UINT32_MAX)
|
||||
if (!cfg["immediate_commit"].is_null() && !etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value()))
|
||||
{
|
||||
return "immediate_commit must be one of \"all\", \"small\", or \"none\", but it is "+cfg["immediate_commit"].as_string();
|
||||
}
|
||||
|
@@ -19,9 +19,6 @@ struct pool_creator_t
|
||||
bool force = false;
|
||||
bool wait = false;
|
||||
|
||||
uint64_t block_size = 0, bitmap_granularity = 0;
|
||||
uint32_t immediate_commit = 0;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
@@ -190,23 +187,13 @@ resume_4:
|
||||
|
||||
if (cfg["pg_size"].uint64_value() > max_pg_size)
|
||||
{
|
||||
std::string pool_err = "Not enough matching OSDs to create pool."
|
||||
" Change parameters or add --force to create a degraded pool."
|
||||
"\n\nAt least "+std::to_string(cfg["pg_size"].uint64_value())+
|
||||
" (pg_size="+std::to_string(cfg["pg_size"].uint64_value())+") OSDs should have:"
|
||||
"\n- block_size "+format_size(block_size, false, true)+
|
||||
"\n- bitmap_granularity "+format_size(bitmap_granularity, false, true);
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
pool_err += "\n- immediate_commit all";
|
||||
else if (immediate_commit == IMMEDIATE_SMALL)
|
||||
pool_err += "\n- immediate_commit all or small";
|
||||
if (cfg["osd_tags"].array_items().size())
|
||||
pool_err += "\n- '"+implode("', '", cfg["osd_tags"])+(cfg["osd_tags"].array_items().size() > 1 ? "' tags" : "' tag");
|
||||
if (failure_domain != "osd")
|
||||
pool_err += "\n- different parent '"+failure_domain+"' nodes";
|
||||
result = (cli_result_t){
|
||||
.err = EINVAL,
|
||||
.text = pool_err,
|
||||
.text =
|
||||
"There are "+std::to_string(max_pg_size)+" \""+failure_domain+"\" failure domains with OSDs matching tags and"
|
||||
" block_size/bitmap_granularity/immediate_commit parameters, but you want to create a"
|
||||
" pool with "+cfg["pg_size"].as_string()+" OSDs from different failure domains in a PG."
|
||||
" Change parameters or add --force if you want to create a degraded pool and add OSDs later."
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
@@ -454,14 +441,14 @@ resume_8:
|
||||
// List of accepted osds
|
||||
std::vector<std::string> accepted_osds;
|
||||
|
||||
block_size = cfg["block_size"].uint64_value()
|
||||
uint64_t p_block_size = cfg["block_size"].uint64_value()
|
||||
? cfg["block_size"].uint64_value()
|
||||
: parent->cli->st_cli.global_block_size;
|
||||
bitmap_granularity = cfg["bitmap_granularity"].uint64_value()
|
||||
uint64_t p_bitmap_granularity = cfg["bitmap_granularity"].uint64_value()
|
||||
? cfg["bitmap_granularity"].uint64_value()
|
||||
: parent->cli->st_cli.global_bitmap_granularity;
|
||||
immediate_commit = cfg["immediate_commit"].is_string()
|
||||
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
||||
uint32_t p_immediate_commit = cfg["immediate_commit"].is_string()
|
||||
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value())
|
||||
: parent->cli->st_cli.global_immediate_commit;
|
||||
|
||||
for (size_t i = 0; i < osd_stats.size(); i++)
|
||||
@@ -469,10 +456,10 @@ resume_8:
|
||||
auto & os = osd_stats[i];
|
||||
// Get osd number
|
||||
auto osd_num = osds[i].as_string();
|
||||
if (!os["data_block_size"].is_null() && os["data_block_size"] != block_size ||
|
||||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != bitmap_granularity ||
|
||||
if (!os["data_block_size"].is_null() && os["data_block_size"] != p_block_size ||
|
||||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != p_bitmap_granularity ||
|
||||
!os["immediate_commit"].is_null() &&
|
||||
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value(), IMMEDIATE_NONE) < immediate_commit)
|
||||
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value()) < p_immediate_commit)
|
||||
{
|
||||
accepted_nodes.erase(osd_num);
|
||||
}
|
||||
|
@@ -214,10 +214,10 @@ resume_1:
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/pgstats/"
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/stats/"
|
||||
) },
|
||||
{ "range_end", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/pgstats0"
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/stats0"
|
||||
) },
|
||||
} },
|
||||
},
|
||||
@@ -235,7 +235,7 @@ resume_1:
|
||||
}
|
||||
// Calculate recovery percent
|
||||
std::map<pool_id_t, object_counts_t> counts;
|
||||
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pgstats/",
|
||||
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pg/stats/",
|
||||
[&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
|
||||
{
|
||||
auto & cnt = counts[pool_id];
|
||||
|
@@ -25,7 +25,6 @@ struct rm_inode_t
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
uint64_t max_offset = 0;
|
||||
bool down_ok = false;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
@@ -53,7 +52,7 @@ struct rm_inode_t
|
||||
.obj_done = 0,
|
||||
.synced = parent->cli->get_immediate_commit(inode),
|
||||
});
|
||||
if (min_offset == 0 && max_offset == 0)
|
||||
if (min_offset == 0)
|
||||
{
|
||||
total_count += objects.size();
|
||||
}
|
||||
@@ -61,7 +60,7 @@ struct rm_inode_t
|
||||
{
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
if (oid.stripe >= min_offset && (!max_offset || oid.stripe < max_offset))
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
@@ -117,7 +116,7 @@ struct rm_inode_t
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
if (cur_list->obj_pos->stripe >= min_offset && (!max_offset || cur_list->obj_pos->stripe < max_offset))
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
@@ -288,7 +287,6 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
|
||||
remover->down_ok = cfg["down_ok"].bool_value();
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
remover->min_offset = cfg["min_offset"].uint64_value();
|
||||
remover->max_offset = cfg["max_offset"].uint64_value();
|
||||
return [remover](cli_result_t & result)
|
||||
{
|
||||
remover->loop();
|
||||
|
@@ -176,7 +176,7 @@ struct rm_osd_t
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/config"
|
||||
parent->cli->st_cli.etcd_prefix+"/config/pgs"
|
||||
) },
|
||||
} },
|
||||
},
|
||||
@@ -229,7 +229,7 @@ struct rm_osd_t
|
||||
}
|
||||
if (!new_pgs.is_null())
|
||||
{
|
||||
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/pg/config");
|
||||
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
|
||||
rm_items.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", pgs_key },
|
||||
@@ -427,7 +427,7 @@ struct rm_osd_t
|
||||
{ "target", "MOD" },
|
||||
{ "key", history_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision_pg+1 },
|
||||
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@
|
||||
#include "str_util.h"
|
||||
|
||||
static const char *help_text =
|
||||
"Vitastor disk management tool " VITASTOR_VERSION "\n"
|
||||
"Vitastor disk management tool " VERSION "\n"
|
||||
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"COMMANDS:\n"
|
||||
|
@@ -383,7 +383,7 @@ int disk_tool_t::pre_exec_osd(std::string device)
|
||||
|
||||
int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
||||
{
|
||||
std::set<uint64_t> osd_numbers;
|
||||
std::vector<uint64_t> osd_numbers;
|
||||
json11::Json::array superblocks;
|
||||
for (auto & device: devices)
|
||||
{
|
||||
@@ -391,11 +391,8 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
||||
if (!sb.is_null())
|
||||
{
|
||||
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
|
||||
if (osd_numbers.find(osd_num) == osd_numbers.end())
|
||||
{
|
||||
osd_numbers.insert(osd_num);
|
||||
superblocks.push_back(sb);
|
||||
}
|
||||
osd_numbers.push_back(osd_num);
|
||||
superblocks.push_back(sb);
|
||||
}
|
||||
}
|
||||
if (!osd_numbers.size())
|
||||
|
@@ -10,7 +10,7 @@ set_target_properties(vitastor_kv PROPERTIES PUBLIC_HEADER "kv/vitastor_kv.h")
|
||||
target_link_libraries(vitastor_kv
|
||||
vitastor_client
|
||||
)
|
||||
set_target_properties(vitastor_kv PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
|
||||
set_target_properties(vitastor_kv PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
|
||||
# vitastor-kv
|
||||
add_executable(vitastor-kv
|
||||
|
@@ -25,7 +25,7 @@ public:
|
||||
std::map<std::string, std::string> cfg;
|
||||
std::vector<std::string> cli_cmd;
|
||||
|
||||
vitastorkv_dbw_t *db = NULL;
|
||||
kv_dbw_t *db = NULL;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
@@ -144,7 +144,7 @@ void kv_cli_t::run()
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
db = new vitastorkv_dbw_t(cli);
|
||||
db = new kv_dbw_t(cli);
|
||||
// Load image metadata
|
||||
while (!cli->is_ready())
|
||||
{
|
||||
@@ -185,7 +185,7 @@ void kv_cli_t::run()
|
||||
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
|
||||
try
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
|
||||
epmgr->tfd->set_fd_handler(0, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
if (events & EPOLLIN)
|
||||
{
|
||||
@@ -193,7 +193,7 @@ void kv_cli_t::run()
|
||||
}
|
||||
if (events & EPOLLRDHUP)
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, NULL);
|
||||
epmgr->tfd->set_fd_handler(0, 0, NULL);
|
||||
finished = true;
|
||||
}
|
||||
});
|
||||
@@ -289,7 +289,7 @@ void kv_cli_t::next_cmd()
|
||||
|
||||
struct kv_cli_list_t
|
||||
{
|
||||
vitastorkv_dbw_t *db = NULL;
|
||||
kv_dbw_t *db = NULL;
|
||||
void *handle = NULL;
|
||||
int format = 0;
|
||||
int n = 0;
|
||||
|
@@ -501,7 +501,7 @@ void kv_block_t::dump(int base_level)
|
||||
|
||||
void kv_db_t::open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb)
|
||||
{
|
||||
if (block_cache.size() > 0 || this->inode_id)
|
||||
if (block_cache.size() > 0)
|
||||
{
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
@@ -1958,38 +1958,38 @@ void kv_op_t::next_go_up()
|
||||
}
|
||||
}
|
||||
|
||||
vitastorkv_dbw_t::vitastorkv_dbw_t(cluster_client_t *cli)
|
||||
kv_dbw_t::kv_dbw_t(cluster_client_t *cli)
|
||||
{
|
||||
db = new kv_db_t();
|
||||
db->cli = cli;
|
||||
}
|
||||
|
||||
vitastorkv_dbw_t::~vitastorkv_dbw_t()
|
||||
kv_dbw_t::~kv_dbw_t()
|
||||
{
|
||||
delete db;
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::open(uint64_t inode_id, std::map<std::string, std::string> cfg, std::function<void(int)> cb)
|
||||
void kv_dbw_t::open(uint64_t inode_id, std::map<std::string, std::string> cfg, std::function<void(int)> cb)
|
||||
{
|
||||
db->open(inode_id, cfg, cb);
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::set_config(std::map<std::string, std::string> cfg)
|
||||
void kv_dbw_t::set_config(std::map<std::string, std::string> cfg)
|
||||
{
|
||||
db->set_config(cfg);
|
||||
}
|
||||
|
||||
uint64_t vitastorkv_dbw_t::get_size()
|
||||
uint64_t kv_dbw_t::get_size()
|
||||
{
|
||||
return db->next_free;
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::close(std::function<void()> cb)
|
||||
void kv_dbw_t::close(std::function<void()> cb)
|
||||
{
|
||||
db->close(cb);
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::get(const std::string & key, std::function<void(int res, const std::string & value)> cb, bool cached)
|
||||
void kv_dbw_t::get(const std::string & key, std::function<void(int res, const std::string & value)> cb, bool cached)
|
||||
{
|
||||
auto *op = new kv_op_t;
|
||||
op->db = db;
|
||||
@@ -2003,7 +2003,7 @@ void vitastorkv_dbw_t::get(const std::string & key, std::function<void(int res,
|
||||
op->exec();
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
|
||||
void kv_dbw_t::set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
|
||||
std::function<bool(int res, const std::string & value)> cas_compare)
|
||||
{
|
||||
auto *op = new kv_op_t;
|
||||
@@ -2023,7 +2023,7 @@ void vitastorkv_dbw_t::set(const std::string & key, const std::string & value, s
|
||||
op->exec();
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::del(const std::string & key, std::function<void(int res)> cb,
|
||||
void kv_dbw_t::del(const std::string & key, std::function<void(int res)> cb,
|
||||
std::function<bool(int res, const std::string & value)> cas_compare)
|
||||
{
|
||||
auto *op = new kv_op_t;
|
||||
@@ -2042,7 +2042,7 @@ void vitastorkv_dbw_t::del(const std::string & key, std::function<void(int res)>
|
||||
op->exec();
|
||||
}
|
||||
|
||||
void* vitastorkv_dbw_t::list_start(const std::string & start)
|
||||
void* kv_dbw_t::list_start(const std::string & start)
|
||||
{
|
||||
if (!db->inode_id || db->closing)
|
||||
return NULL;
|
||||
@@ -2055,7 +2055,7 @@ void* vitastorkv_dbw_t::list_start(const std::string & start)
|
||||
return op;
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb)
|
||||
void kv_dbw_t::list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb)
|
||||
{
|
||||
kv_op_t *op = (kv_op_t*)handle;
|
||||
if (cb)
|
||||
@@ -2068,7 +2068,7 @@ void vitastorkv_dbw_t::list_next(void *handle, std::function<void(int res, const
|
||||
op->next();
|
||||
}
|
||||
|
||||
void vitastorkv_dbw_t::list_close(void *handle)
|
||||
void kv_dbw_t::list_close(void *handle)
|
||||
{
|
||||
kv_op_t *op = (kv_op_t*)handle;
|
||||
delete op;
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user