Compare commits
53 Commits
Author | SHA1 | Date |
---|---|---|
|
dfde0e60f0 | |
|
013f688ffe | |
|
cf9738ddbe | |
|
891b2811c7 | |
|
01590df6da | |
|
3e5f0be52c | |
|
58af897e73 | |
|
dbf9ecd171 | |
|
8508e78288 | |
|
f32dea02bf | |
|
a103065d12 | |
|
5d2e28d4a9 | |
|
18e14eed11 | |
|
ccc32b9e68 | |
|
ebaf3fee79 | |
|
196d28e987 | |
|
8f243b2328 | |
|
7a835fcd8f | |
|
8b0389b4e8 | |
|
f544c350ba | |
|
4eafb55b5c | |
|
5030396f71 | |
|
be22c363ca | |
|
0f80c87b43 | |
|
e0953fd502 | |
|
6e0ae47938 | |
|
b8f19e85ad | |
|
b7636e595f | |
|
48c026bfa0 | |
|
a73b2a26b6 | |
|
f3192b610d | |
|
a950889976 | |
|
ef5194d93c | |
|
f904576ab1 | |
|
4f9b1f2f62 | |
|
1d94afbd51 | |
|
3634f005f1 | |
|
263a3b5ad6 | |
|
b760951aa7 | |
|
c8321b8ed1 | |
|
21066a095b | |
|
a96900b696 | |
|
8a6e461322 | |
|
0b6a0463a4 | |
|
35d4047f46 | |
|
819f1125ae | |
|
108df7329f | |
|
d32edf6cdf | |
|
dca436d7e6 | |
|
8129a0b4e3 | |
|
704c87d512 | |
|
10216a5fb5 | |
|
3932eb7ff6 |
docker
etc
apt/sources.list.d
systemd/system
vitastor
docs
node-binding
src
disk_tool
|
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VITASTOR_VERSION "1.11.0")
|
set(VITASTOR_VERSION "2.1.0")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
Вернём былую скорость кластерному блочному хранилищу!
|
Вернём былую скорость кластерному блочному хранилищу!
|
||||||
|
|
||||||
Vitastor - распределённая блочная и файловая SDS (программная СХД), прямой аналог Ceph RBD и CephFS,
|
Vitastor - распределённая блочная, файловая и объектная SDS (программная СХД), прямой аналог Ceph RBD, CephFS и RGW,
|
||||||
а также внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
|
а также внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
|
||||||
быстрый и при этом простой. Только пока маленький :-).
|
быстрый и при этом простой. Только пока маленький :-).
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
|
||||||
- [OpenNebula](docs/installation/opennebula.ru.md)
|
- [OpenNebula](docs/installation/opennebula.ru.md)
|
||||||
- [OpenStack](docs/installation/openstack.ru.md)
|
- [OpenStack](docs/installation/openstack.ru.md)
|
||||||
- [Kubernetes CSI](docs/installation/kubernetes.ru.md)
|
- [Kubernetes CSI](docs/installation/kubernetes.ru.md)
|
||||||
|
- [S3](docs/installation/s3.ru.md)
|
||||||
- [Сборка из исходных кодов](docs/installation/source.ru.md)
|
- [Сборка из исходных кодов](docs/installation/source.ru.md)
|
||||||
- Конфигурация
|
- Конфигурация
|
||||||
- [Обзор](docs/config.ru.md)
|
- [Обзор](docs/config.ru.md)
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
Make Clustered Block Storage Fast Again.
|
Make Clustered Block Storage Fast Again.
|
||||||
|
|
||||||
Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
|
Vitastor is a distributed block, file and object SDS, direct replacement of Ceph RBD, CephFS and RGW,
|
||||||
and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
|
and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
|
||||||
and simple at the same time. The only thing is it's slightly young :-).
|
and simple at the same time. The only thing is it's slightly young :-).
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ Read more details in the documentation. You can start from here: [Quick Start](d
|
||||||
- [OpenNebula](docs/installation/opennebula.en.md)
|
- [OpenNebula](docs/installation/opennebula.en.md)
|
||||||
- [OpenStack](docs/installation/openstack.en.md)
|
- [OpenStack](docs/installation/openstack.en.md)
|
||||||
- [Kubernetes CSI](docs/installation/kubernetes.en.md)
|
- [Kubernetes CSI](docs/installation/kubernetes.en.md)
|
||||||
|
- [S3](docs/installation/s3.en.md)
|
||||||
- [Building from Source](docs/installation/source.en.md)
|
- [Building from Source](docs/installation/source.en.md)
|
||||||
- Configuration
|
- Configuration
|
||||||
- [Overview](docs/config.en.md)
|
- [Overview](docs/config.en.md)
|
||||||
|
|
|
@ -37,8 +37,8 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
|
||||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y vitastor-client && \
|
apt-get install -y vitastor-client && \
|
||||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
|
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
|
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||||
dpkg -x qemu-utils*.deb tmp1 && \
|
dpkg -x qemu-utils*.deb tmp1 && \
|
||||||
dpkg -x qemu-block-extra*.deb tmp1 && \
|
dpkg -x qemu-block-extra*.deb tmp1 && \
|
||||||
cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
|
cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
VITASTOR_VERSION ?= v1.11.0
|
VITASTOR_VERSION ?= v2.1.0
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ spec:
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v1.11.0
|
image: vitalif/vitastor-csi:v2.1.0
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -121,7 +121,7 @@ spec:
|
||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v1.11.0
|
image: vitalif/vitastor-csi:v2.1.0
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -5,7 +5,7 @@ package vitastor
|
||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "1.11.0"
|
vitastorCSIDriverVersion = "2.1.0"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
vitastor (1.11.0-1) unstable; urgency=medium
|
vitastor (2.1.0-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,10 @@ Source: vitastor
|
||||||
Section: admin
|
Section: admin
|
||||||
Priority: optional
|
Priority: optional
|
||||||
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||||
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
|
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8),
|
||||||
|
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
|
||||||
|
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
|
||||||
|
node-bindings <!nocheck>, node-gyp, node-nan
|
||||||
Standards-Version: 4.5.0
|
Standards-Version: 4.5.0
|
||||||
Homepage: https://vitastor.io/
|
Homepage: https://vitastor.io/
|
||||||
Rules-Requires-Root: no
|
Rules-Requires-Root: no
|
||||||
|
@ -59,3 +62,9 @@ Architecture: amd64
|
||||||
Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client, patch, python3, jq
|
Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client, patch, python3, jq
|
||||||
Description: Vitastor OpenNebula storage plugin
|
Description: Vitastor OpenNebula storage plugin
|
||||||
Vitastor storage plugin for OpenNebula.
|
Vitastor storage plugin for OpenNebula.
|
||||||
|
|
||||||
|
Package: node-vitastor
|
||||||
|
Architecture: amd64
|
||||||
|
Depends: ${shlibs:Depends}, ${misc:Depends}, node-bindings
|
||||||
|
Description: Node.js bindings for Vitastor client
|
||||||
|
Node.js native bindings for the Vitastor client library (vitastor-client).
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
usr/lib/x86_64-linux-gnu/nodejs/vitastor
|
|
@ -10,10 +10,14 @@ ARG REL=
|
||||||
WORKDIR /root
|
WORKDIR /root
|
||||||
|
|
||||||
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
|
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
|
||||||
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
|
if [ "$REL" = "buster" ]; then \
|
||||||
|
echo "deb http://archive.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
|
||||||
|
else \
|
||||||
|
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
|
||||||
|
fi; \
|
||||||
echo >> /etc/apt/preferences; \
|
echo >> /etc/apt/preferences; \
|
||||||
echo 'Package: *' >> /etc/apt/preferences; \
|
echo 'Package: *' >> /etc/apt/preferences; \
|
||||||
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
|
echo "Pin: release n=$REL-backports" >> /etc/apt/preferences; \
|
||||||
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
|
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
|
||||||
fi; \
|
fi; \
|
||||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||||
|
@ -56,7 +60,7 @@ RUN set -e; \
|
||||||
quilt add block/vitastor.c; \
|
quilt add block/vitastor.c; \
|
||||||
cp /root/qemu_driver.c block/vitastor.c; \
|
cp /root/qemu_driver.c block/vitastor.c; \
|
||||||
quilt refresh; \
|
quilt refresh; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
|
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(\+deb\d+u\d+)?(~bpo[\d\+]*)?\).*$/$1/')+vitastor5; \
|
||||||
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
|
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
|
||||||
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
|
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
|
|
@ -4,6 +4,14 @@ export DH_VERBOSE = 1
|
||||||
%:
|
%:
|
||||||
dh $@
|
dh $@
|
||||||
|
|
||||||
|
override_dh_install:
|
||||||
|
perl -pe 's!prefix=/usr!prefix='`pwd`'/debian/tmp/usr!' < obj-x86_64-linux-gnu/src/client/vitastor.pc > node-binding/vitastor.pc
|
||||||
|
cd node-binding && PKG_CONFIG_PATH=./ PKG_CONFIG_ALLOW_SYSTEM_CFLAGS=1 npm install --unsafe-perm || exit 1
|
||||||
|
mkdir -p debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor/build/Release
|
||||||
|
cp -v node-binding/package.json node-binding/index.js node-binding/addon.cc node-binding/addon.h node-binding/client.cc node-binding/client.h debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor
|
||||||
|
cp -v node-binding/build/Release/addon.node debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor/build/Release
|
||||||
|
dh_install
|
||||||
|
|
||||||
override_dh_installdeb:
|
override_dh_installdeb:
|
||||||
cat debian/fio_version >> debian/vitastor-fio.substvars
|
cat debian/fio_version >> debian/vitastor-fio.substvars
|
||||||
[ -f debian/qemu_version ] && (cat debian/qemu_version >> debian/vitastor-qemu.substvars) || true
|
[ -f debian/qemu_version ] && (cat debian/qemu_version >> debian/vitastor-qemu.substvars) || true
|
||||||
|
|
|
@ -22,7 +22,8 @@ RUN set -e -x; \
|
||||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl && \
|
apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake \
|
||||||
|
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
|
||||||
apt-get -y build-dep fio && \
|
apt-get -y build-dep fio && \
|
||||||
apt-get --download-only source fio
|
apt-get --download-only source fio
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
FROM debian:bookworm
|
FROM debian:bookworm
|
||||||
|
|
||||||
ADD etc/apt /etc/apt/
|
ADD etc/apt /etc/apt/
|
||||||
RUN apt-get update && apt-get -y install vitastor qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
||||||
ADD sleep.sh /usr/bin/
|
ADD sleep.sh /usr/bin/
|
||||||
ADD install.sh /usr/bin/
|
ADD install.sh /usr/bin/
|
||||||
ADD scripts /opt/scripts/
|
ADD scripts /opt/scripts/
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
VITASTOR_VERSION ?= v1.11.0
|
VITASTOR_VERSION ?= v2.1.0
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
build:
|
build:
|
||||||
@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
@docker build --no-cache --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
||||||
|
|
||||||
push:
|
push:
|
||||||
@docker push vitalif/vitastor:$(VITASTOR_VERSION)
|
@docker push vitalif/vitastor:$(VITASTOR_VERSION)
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
deb http://vitastor.io/debian bookworm main
|
deb http://vitastor.io/debian bookworm main
|
||||||
|
deb http://http.debian.net/debian/ bookworm-backports main
|
||||||
|
|
|
@ -7,8 +7,8 @@ PartOf=vitastor.target
|
||||||
[Service]
|
[Service]
|
||||||
Restart=always
|
Restart=always
|
||||||
EnvironmentFile=/etc/vitastor/docker.conf
|
EnvironmentFile=/etc/vitastor/docker.conf
|
||||||
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
|
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev -v /run:/run \
|
||||||
--privileged --log-driver none --network host --name vitastor vitastor:$VITASTOR_VERSION \
|
--security-opt seccomp=unconfined --privileged --pid=host --log-driver none --network host --name vitastor vitastor:$VITASTOR_VERSION \
|
||||||
sleep.sh'
|
sleep.sh'
|
||||||
ExecStartPost=udevadm trigger
|
ExecStartPost=udevadm trigger
|
||||||
ExecStop=docker stop vitastor
|
ExecStop=docker stop vitastor
|
||||||
|
|
|
@ -12,7 +12,8 @@ EnvironmentFile=/etc/vitastor/docker.conf
|
||||||
SyslogIdentifier=vitastor-osd%i
|
SyslogIdentifier=vitastor-osd%i
|
||||||
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
|
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
|
||||||
$(for i in $(ls /dev/vitastor/osd%i-*); do echo --device $i:$i; done) \
|
$(for i in $(ls /dev/vitastor/osd%i-*); do echo --device $i:$i; done) \
|
||||||
--log-driver none --network host --ulimit nofile=1048576 --ulimit memlock=-1 $CONTAINER_OPTIONS --name vitastor-osd%i \
|
--log-driver none --network host --ulimit nofile=1048576 --ulimit memlock=-1 \
|
||||||
|
--security-opt seccomp=unconfined $CONTAINER_OPTIONS --name vitastor-osd%i \
|
||||||
vitastor:$VITASTOR_VERSION vitastor-disk exec-osd /dev/vitastor/osd%i-data'
|
vitastor:$VITASTOR_VERSION vitastor-disk exec-osd /dev/vitastor/osd%i-data'
|
||||||
ExecStartPre=+docker exec vitastor vitastor-disk pre-exec /dev/vitastor/osd%i-data
|
ExecStartPre=+docker exec vitastor vitastor-disk pre-exec /dev/vitastor/osd%i-data
|
||||||
ExecStop=docker stop vitastor-etcd%i
|
ExecStop=docker stop vitastor-etcd%i
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# Desired Vitastor version
|
# Desired Vitastor version
|
||||||
VITASTOR_VERSION=1.11.0
|
VITASTOR_VERSION=v2.1.0
|
||||||
|
|
||||||
# Additional arguments for all containers
|
# Additional arguments for all containers
|
||||||
# For example, you may want to specify a custom logging driver here
|
# For example, you may want to specify a custom logging driver here
|
||||||
|
|
|
@ -13,6 +13,7 @@ affect their interaction with the cluster.
|
||||||
- [client_retry_interval](#client_retry_interval)
|
- [client_retry_interval](#client_retry_interval)
|
||||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||||
- [client_retry_enospc](#client_retry_enospc)
|
- [client_retry_enospc](#client_retry_enospc)
|
||||||
|
- [client_wait_up_timeout](#client_wait_up_timeout)
|
||||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||||
- [client_enable_writeback](#client_enable_writeback)
|
- [client_enable_writeback](#client_enable_writeback)
|
||||||
|
@ -70,6 +71,19 @@ and clients are not blocked and just get EIO error code instead.
|
||||||
Retry writes on out of space errors to wait until some space is freed on
|
Retry writes on out of space errors to wait until some space is freed on
|
||||||
OSDs.
|
OSDs.
|
||||||
|
|
||||||
|
## client_wait_up_timeout
|
||||||
|
|
||||||
|
- Type: seconds
|
||||||
|
- Default: 16
|
||||||
|
- Can be changed online: yes
|
||||||
|
|
||||||
|
Wait for this number of seconds until PGs are up when doing operations
|
||||||
|
which require all PGs to be up. Currently only used by object listings
|
||||||
|
in delete and merge-based commands ([vitastor-cli rm](../usage/cli.en.md#rm), merge and so on).
|
||||||
|
|
||||||
|
The default value is calculated as `1 + OSD lease timeout`, which is
|
||||||
|
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
|
||||||
|
|
||||||
## client_max_dirty_bytes
|
## client_max_dirty_bytes
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
- [client_retry_interval](#client_retry_interval)
|
- [client_retry_interval](#client_retry_interval)
|
||||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||||
- [client_retry_enospc](#client_retry_enospc)
|
- [client_retry_enospc](#client_retry_enospc)
|
||||||
|
- [client_wait_up_timeout](#client_wait_up_timeout)
|
||||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||||
- [client_enable_writeback](#client_enable_writeback)
|
- [client_enable_writeback](#client_enable_writeback)
|
||||||
|
@ -72,6 +73,19 @@ RDMA и хотите повысить пиковую производитель
|
||||||
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
|
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
|
||||||
ожидать, пока на OSD не освободится место.
|
ожидать, пока на OSD не освободится место.
|
||||||
|
|
||||||
|
## client_wait_up_timeout
|
||||||
|
|
||||||
|
- Тип: секунды
|
||||||
|
- Значение по умолчанию: 16
|
||||||
|
- Можно менять на лету: да
|
||||||
|
|
||||||
|
Время ожидания поднятия PG при операциях, требующих активности всех PG.
|
||||||
|
В данный момент используется листингами объектов в командах, использующих
|
||||||
|
удаление и слияние ([vitastor-cli rm](../usage/cli.ru.md#rm), merge и подобные).
|
||||||
|
|
||||||
|
Значение по умолчанию вычисляется как `1 + время lease OSD`, равное
|
||||||
|
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
|
||||||
|
|
||||||
## client_max_dirty_bytes
|
## client_max_dirty_bytes
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
|
|
|
@ -74,13 +74,13 @@ Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json]
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 8060
|
- Default: 8060
|
||||||
|
|
||||||
HTTP port for monitors to listen on (including metrics exporter)
|
HTTP port for monitors to listen on (including metrics exporter)
|
||||||
|
|
||||||
## mon_http_ip
|
## mon_http_ip
|
||||||
|
|
||||||
- Type: string
|
- Type: string
|
||||||
|
|
||||||
IP address for monitors to listen on (all addresses by default)
|
IP address for monitors to listen on (all addresses by default)
|
||||||
|
|
||||||
## mon_https_cert
|
## mon_https_cert
|
||||||
|
|
||||||
|
|
|
@ -9,9 +9,11 @@
|
||||||
These parameters apply to clients and OSDs and affect network connection logic
|
These parameters apply to clients and OSDs and affect network connection logic
|
||||||
between clients, OSDs and etcd.
|
between clients, OSDs and etcd.
|
||||||
|
|
||||||
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
- [osd_network](#osd_network)
|
||||||
- [use_sync_send_recv](#use_sync_send_recv)
|
- [osd_cluster_network](#osd_cluster_network)
|
||||||
- [use_rdma](#use_rdma)
|
- [use_rdma](#use_rdma)
|
||||||
|
- [use_rdmacm](#use_rdmacm)
|
||||||
|
- [disable_tcp](#disable_tcp)
|
||||||
- [rdma_device](#rdma_device)
|
- [rdma_device](#rdma_device)
|
||||||
- [rdma_port_num](#rdma_port_num)
|
- [rdma_port_num](#rdma_port_num)
|
||||||
- [rdma_gid_index](#rdma_gid_index)
|
- [rdma_gid_index](#rdma_gid_index)
|
||||||
|
@ -30,38 +32,62 @@ between clients, OSDs and etcd.
|
||||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||||
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
||||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||||
|
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
||||||
|
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
||||||
|
- [use_sync_send_recv](#use_sync_send_recv)
|
||||||
|
|
||||||
## tcp_header_buffer_size
|
## osd_network
|
||||||
|
|
||||||
- Type: integer
|
- Type: string or array of strings
|
||||||
- Default: 65536
|
|
||||||
|
|
||||||
Size of the buffer used to read data using an additional copy. Vitastor
|
Network mask of public OSD network(s) (IPv4 or IPv6). Each OSD listens on all
|
||||||
packet headers are 128 bytes, payload is always at least 4 KB, so it is
|
addresses of UP + RUNNING interfaces matching one of these networks, on the
|
||||||
usually beneficial to try to read multiple packets at once even though
|
same port. Port is auto-selected except if [bind_port](osd.en.md#bind_port) is
|
||||||
it requires to copy the data an additional time. The rest of each packet
|
explicitly specified. Bind address(es) may also be overridden manually by
|
||||||
is received without an additional copy. You can try to play with this
|
specifying [bind_address](osd.en.md#bind_address). If OSD networks are not specified
|
||||||
parameter and see how it affects random iops and linear bandwidth if you
|
at all, the OSD just listens on the wildcard address (0.0.0.0).
|
||||||
want.
|
|
||||||
|
|
||||||
## use_sync_send_recv
|
## osd_cluster_network
|
||||||
|
|
||||||
- Type: boolean
|
- Type: string or array of strings
|
||||||
- Default: false
|
|
||||||
|
|
||||||
If true, synchronous send/recv syscalls are used instead of io_uring for
|
Network mask of separate network(s) (IPv4 or IPv6) to use for OSD
|
||||||
socket communication. Useless for OSDs because they require io_uring anyway,
|
cluster connections. I.e. OSDs will always attempt to use these networks
|
||||||
but may be required for clients with old kernel versions.
|
to connect to other OSDs, while clients will attempt to use networks from
|
||||||
|
[osd_network](#osd_network).
|
||||||
|
|
||||||
## use_rdma
|
## use_rdma
|
||||||
|
|
||||||
- Type: boolean
|
- Type: boolean
|
||||||
- Default: true
|
- Default: true
|
||||||
|
|
||||||
Try to use RDMA for communication if it's available. Disable if you don't
|
Try to use RDMA through libibverbs for communication if it's available.
|
||||||
want Vitastor to use RDMA. TCP-only clients can also talk to an RDMA-enabled
|
Disable if you don't want Vitastor to use RDMA. TCP-only clients can also
|
||||||
cluster, so disabling RDMA may be needed if clients have RDMA devices,
|
talk to an RDMA-enabled cluster, so disabling RDMA may be needed if clients
|
||||||
but they are not connected to the cluster.
|
have RDMA devices, but they are not connected to the cluster.
|
||||||
|
|
||||||
|
`use_rdma` works with RoCEv1/RoCEv2 networks, but not with iWARP and
|
||||||
|
possibly not with some Infiniband configurations which require RDMA-CM.
|
||||||
|
Consider `use_rdmacm` for such networks.
|
||||||
|
|
||||||
|
## use_rdmacm
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
- Default: true
|
||||||
|
|
||||||
|
Use an alternative implementation of RDMA through RDMA-CM (Connection
|
||||||
|
Manager). Works with all RDMA networks: Infiniband, iWARP and
|
||||||
|
RoCEv1/RoCEv2, and even allows disabling TCP and running only with RDMA.
|
||||||
|
OSDs always use random port numbers for RDMA-CM listeners, different
|
||||||
|
from their TCP ports. `use_rdma` is automatically disabled when
|
||||||
|
`use_rdmacm` is enabled.
|
||||||
|
|
||||||
|
## disable_tcp
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
- Default: true
|
||||||
|
|
||||||
|
Fully disable TCP and only use RDMA-CM for OSD communication.
|
||||||
|
|
||||||
## rdma_device
|
## rdma_device
|
||||||
|
|
||||||
|
@ -92,12 +118,13 @@ PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 1
|
|
||||||
|
|
||||||
RDMA device port number to use. Only for devices that have more than 1 port.
|
RDMA device port number to use. Only for devices that have more than 1 port.
|
||||||
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
|
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
|
||||||
your device has.
|
your device has.
|
||||||
|
|
||||||
|
Not relevant for RDMA-CM (use_rdmacm).
|
||||||
|
|
||||||
## rdma_gid_index
|
## rdma_gid_index
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
|
@ -113,13 +140,14 @@ GID auto-selection is unsupported with libibverbs < v32.
|
||||||
|
|
||||||
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
|
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
|
||||||
|
|
||||||
|
Not relevant for RDMA-CM (use_rdmacm).
|
||||||
|
|
||||||
## rdma_mtu
|
## rdma_mtu
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 4096
|
|
||||||
|
|
||||||
RDMA Path MTU to use. Must be 1024, 2048 or 4096. There is usually no
|
RDMA Path MTU to use. Must be 1024, 2048 or 4096. Default is to use the
|
||||||
sense to change it from the default 4096.
|
RDMA device's MTU.
|
||||||
|
|
||||||
## rdma_max_sge
|
## rdma_max_sge
|
||||||
|
|
||||||
|
@ -261,3 +289,35 @@ etcd_report_interval to guarantee that keepalive actually works.
|
||||||
|
|
||||||
etcd websocket ping interval required to keep the connection alive and
|
etcd websocket ping interval required to keep the connection alive and
|
||||||
detect disconnections quickly.
|
detect disconnections quickly.
|
||||||
|
|
||||||
|
## etcd_min_reload_interval
|
||||||
|
|
||||||
|
- Type: milliseconds
|
||||||
|
- Default: 1000
|
||||||
|
- Can be changed online: yes
|
||||||
|
|
||||||
|
Minimum interval for full etcd state reload. Introduced to prevent
|
||||||
|
excessive load on etcd during outages when etcd can't keep up with event
|
||||||
|
streams and cancels them.
|
||||||
|
|
||||||
|
## tcp_header_buffer_size
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 65536
|
||||||
|
|
||||||
|
Size of the buffer used to read data using an additional copy. Vitastor
|
||||||
|
packet headers are 128 bytes, payload is always at least 4 KB, so it is
|
||||||
|
usually beneficial to try to read multiple packets at once even though
|
||||||
|
it requires copying the data an additional time. The rest of each packet
|
||||||
|
is received without an additional copy. You can try to play with this
|
||||||
|
parameter and see how it affects random iops and linear bandwidth if you
|
||||||
|
want.
|
||||||
|
|
||||||
|
## use_sync_send_recv
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
- Default: false
|
||||||
|
|
||||||
|
If true, synchronous send/recv syscalls are used instead of io_uring for
|
||||||
|
socket communication. Useless for OSDs because they require io_uring anyway,
|
||||||
|
but may be required for clients with old kernel versions.
|
||||||
|
|
|
@ -9,9 +9,11 @@
|
||||||
Данные параметры используются клиентами и OSD и влияют на логику сетевого
|
Данные параметры используются клиентами и OSD и влияют на логику сетевого
|
||||||
взаимодействия между клиентами, OSD, а также etcd.
|
взаимодействия между клиентами, OSD, а также etcd.
|
||||||
|
|
||||||
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
- [osd_network](#osd_network)
|
||||||
- [use_sync_send_recv](#use_sync_send_recv)
|
- [osd_cluster_network](#osd_cluster_network)
|
||||||
- [use_rdma](#use_rdma)
|
- [use_rdma](#use_rdma)
|
||||||
|
- [use_rdmacm](#use_rdmacm)
|
||||||
|
- [disable_tcp](#disable_tcp)
|
||||||
- [rdma_device](#rdma_device)
|
- [rdma_device](#rdma_device)
|
||||||
- [rdma_port_num](#rdma_port_num)
|
- [rdma_port_num](#rdma_port_num)
|
||||||
- [rdma_gid_index](#rdma_gid_index)
|
- [rdma_gid_index](#rdma_gid_index)
|
||||||
|
@ -30,41 +32,62 @@
|
||||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||||
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
||||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||||
|
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
||||||
|
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
||||||
|
- [use_sync_send_recv](#use_sync_send_recv)
|
||||||
|
|
||||||
## tcp_header_buffer_size
|
## osd_network
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: строка или массив строк
|
||||||
- Значение по умолчанию: 65536
|
|
||||||
|
|
||||||
Размер буфера для чтения данных с дополнительным копированием. Пакеты
|
Маски подсетей (IPv4 или IPv6) публичной сети или сетей OSD. Каждый OSD слушает
|
||||||
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
|
один и тот же порт на всех адресах поднятых (UP + RUNNING) сетевых интерфейсов,
|
||||||
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
|
соответствующих одной из указанных сетей. Порт выбирается автоматически, если
|
||||||
сразу несколько пакетов, даже несмотря на то, что это требует лишний раз
|
только [bind_port](osd.ru.md#bind_port) не задан явно. Адреса для подключений можно
|
||||||
скопировать данные. Часть каждого пакета за пределами значения данного
|
также переопределить явно, задав [bind_address](osd.ru.md#bind_address). Если сети OSD
|
||||||
параметра читается без дополнительного копирования. Вы можете попробовать
|
не заданы вообще, OSD слушает все адреса (0.0.0.0).
|
||||||
поменять этот параметр и посмотреть, как он влияет на производительность
|
|
||||||
случайного и линейного доступа.
|
|
||||||
|
|
||||||
## use_sync_send_recv
|
## osd_cluster_network
|
||||||
|
|
||||||
- Тип: булево (да/нет)
|
- Тип: строка или массив строк
|
||||||
- Значение по умолчанию: false
|
|
||||||
|
|
||||||
Если установлено в истину, то вместо io_uring для передачи данных по сети
|
Маски подсетей (IPv4 или IPv6) отдельной кластерной сети или сетей OSD.
|
||||||
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
То есть, OSD будут всегда стараться использовать эти сети для соединений
|
||||||
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
с другими OSD, а клиенты будут стараться использовать сети из [osd_network](#osd_network).
|
||||||
принципе, это может применяться для клиентов со старыми версиями ядра.
|
|
||||||
|
|
||||||
## use_rdma
|
## use_rdma
|
||||||
|
|
||||||
- Тип: булево (да/нет)
|
- Тип: булево (да/нет)
|
||||||
- Значение по умолчанию: true
|
- Значение по умолчанию: true
|
||||||
|
|
||||||
Пытаться использовать RDMA для связи при наличии доступных устройств.
|
Попробовать использовать RDMA через libibverbs для связи при наличии
|
||||||
Отключите, если вы не хотите, чтобы Vitastor использовал RDMA.
|
доступных устройств. Отключите, если вы не хотите, чтобы Vitastor
|
||||||
TCP-клиенты также могут работать с RDMA-кластером, так что отключать
|
использовал RDMA. TCP-клиенты также могут работать с RDMA-кластером,
|
||||||
RDMA может быть нужно только если у клиентов есть RDMA-устройства,
|
так что отключать RDMA может быть нужно, только если у клиентов есть
|
||||||
но они не имеют соединения с кластером Vitastor.
|
RDMA-устройства, но они не имеют соединения с кластером Vitastor.
|
||||||
|
|
||||||
|
`use_rdma` работает с RoCEv1/RoCEv2 сетями, но не работает с iWARP и
|
||||||
|
может не работать с частью конфигураций Infiniband, требующих RDMA-CM.
|
||||||
|
Рассмотрите включение `use_rdmacm` для таких сетей.
|
||||||
|
|
||||||
|
## use_rdmacm
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
- Значение по умолчанию: true
|
||||||
|
|
||||||
|
Использовать альтернативную реализацию RDMA на основе RDMA-CM (Connection
|
||||||
|
Manager). Работает со всеми типами RDMA-сетей: Infiniband, iWARP и
|
||||||
|
RoCEv1/RoCEv2, и даже позволяет полностью отключить TCP и работать
|
||||||
|
только на RDMA. OSD используют случайные номера портов для ожидания
|
||||||
|
соединений через RDMA-CM, отличающиеся от их TCP-портов. Также при
|
||||||
|
включении `use_rdmacm` автоматически отключается опция `use_rdma`.
|
||||||
|
|
||||||
|
## disable_tcp
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
- Значение по умолчанию: true
|
||||||
|
|
||||||
|
Полностью отключить TCP и использовать только RDMA-CM для соединений с OSD.
|
||||||
|
|
||||||
## rdma_device
|
## rdma_device
|
||||||
|
|
||||||
|
@ -96,13 +119,14 @@ Control) и ECN (Explicit Congestion Notification).
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 1
|
|
||||||
|
|
||||||
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
|
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
|
||||||
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
|
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
|
||||||
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
|
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
|
||||||
`ibv_devinfo -v`.
|
`ibv_devinfo -v`.
|
||||||
|
|
||||||
|
Опция неприменима к RDMA-CM (use_rdmacm).
|
||||||
|
|
||||||
## rdma_gid_index
|
## rdma_gid_index
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
|
@ -119,13 +143,14 @@ libibverbs < v32.
|
||||||
|
|
||||||
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
|
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
|
||||||
|
|
||||||
|
Опция неприменима к RDMA-CM (use_rdmacm).
|
||||||
|
|
||||||
## rdma_mtu
|
## rdma_mtu
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 4096
|
|
||||||
|
|
||||||
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
|
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
|
||||||
2048 или 4096. Обычно нет смысла менять значение по умолчанию, равное 4096.
|
2048 или 4096. По умолчанию используется значение MTU RDMA-устройства.
|
||||||
|
|
||||||
## rdma_max_sge
|
## rdma_max_sge
|
||||||
|
|
||||||
|
@ -271,3 +296,37 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||||
- Можно менять на лету: да
|
- Можно менять на лету: да
|
||||||
|
|
||||||
Интервал проверки живости вебсокет-подключений к etcd.
|
Интервал проверки живости вебсокет-подключений к etcd.
|
||||||
|
|
||||||
|
## etcd_min_reload_interval
|
||||||
|
|
||||||
|
- Тип: миллисекунды
|
||||||
|
- Значение по умолчанию: 1000
|
||||||
|
- Можно менять на лету: да
|
||||||
|
|
||||||
|
Минимальный интервал полной перезагрузки состояния из etcd. Добавлено для
|
||||||
|
предотвращения избыточной нагрузки на etcd во время отказов, когда etcd не
|
||||||
|
успевает рассылать потоки событий и отменяет их.
|
||||||
|
|
||||||
|
## tcp_header_buffer_size
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 65536
|
||||||
|
|
||||||
|
Размер буфера для чтения данных с дополнительным копированием. Пакеты
|
||||||
|
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
|
||||||
|
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
|
||||||
|
сразу несколько пакетов, даже несмотря на то, что это требует лишний раз
|
||||||
|
скопировать данные. Часть каждого пакета за пределами значения данного
|
||||||
|
параметра читается без дополнительного копирования. Вы можете попробовать
|
||||||
|
поменять этот параметр и посмотреть, как он влияет на производительность
|
||||||
|
случайного и линейного доступа.
|
||||||
|
|
||||||
|
## use_sync_send_recv
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
- Значение по умолчанию: false
|
||||||
|
|
||||||
|
Если установлено в истину, то вместо io_uring для передачи данных по сети
|
||||||
|
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
||||||
|
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
||||||
|
принципе, это может применяться для клиентов со старыми версиями ядра.
|
||||||
|
|
|
@ -7,16 +7,15 @@
|
||||||
# Runtime OSD Parameters
|
# Runtime OSD Parameters
|
||||||
|
|
||||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||||
initialization and can be changed - either with an OSD restart or, for some of
|
initialization and can be changed - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.en.md#update-sb)
|
||||||
them, even without restarting by updating configuration in etcd.
|
with an OSD restart or, for some of them, even without restarting by updating configuration in etcd.
|
||||||
|
|
||||||
|
- [bind_address](#bind_address)
|
||||||
|
- [bind_port](#bind_port)
|
||||||
- [osd_iothread_count](#osd_iothread_count)
|
- [osd_iothread_count](#osd_iothread_count)
|
||||||
- [etcd_report_interval](#etcd_report_interval)
|
- [etcd_report_interval](#etcd_report_interval)
|
||||||
- [etcd_stats_interval](#etcd_stats_interval)
|
- [etcd_stats_interval](#etcd_stats_interval)
|
||||||
- [run_primary](#run_primary)
|
- [run_primary](#run_primary)
|
||||||
- [osd_network](#osd_network)
|
|
||||||
- [bind_address](#bind_address)
|
|
||||||
- [bind_port](#bind_port)
|
|
||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
@ -61,6 +60,26 @@ them, even without restarting by updating configuration in etcd.
|
||||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||||
|
- [discard_on_start](#discard_on_start)
|
||||||
|
- [min_discard_size](#min_discard_size)
|
||||||
|
- [allow_net_split](#allow_net_split)
|
||||||
|
|
||||||
|
## bind_address
|
||||||
|
|
||||||
|
- Type: string or array of strings
|
||||||
|
|
||||||
|
Instead of the network masks ([osd_network](network.en.md#osd_network) and
|
||||||
|
[osd_cluster_network](network.en.md#osd_cluster_network)), you can also set
|
||||||
|
OSD listen addresses explicitly using this parameter. May be useful if you
|
||||||
|
want to start OSDs on interfaces that are not UP + RUNNING.
|
||||||
|
|
||||||
|
## bind_port
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
|
||||||
|
By default, OSDs pick random ports to use for incoming connections
|
||||||
|
automatically. With this option you can set a specific port for a specific
|
||||||
|
OSD by hand.
|
||||||
|
|
||||||
## osd_iothread_count
|
## osd_iothread_count
|
||||||
|
|
||||||
|
@ -104,34 +123,6 @@ debugging purposes. It's possible to implement additional feature for the
|
||||||
monitor which may allow to separate primary and secondary OSDs, but it's
|
monitor which may allow to separate primary and secondary OSDs, but it's
|
||||||
unclear why anyone could need it, so it's not implemented.
|
unclear why anyone could need it, so it's not implemented.
|
||||||
|
|
||||||
## osd_network
|
|
||||||
|
|
||||||
- Type: string or array of strings
|
|
||||||
|
|
||||||
Network mask of the network (IPv4 or IPv6) to use for OSDs. Note that
|
|
||||||
although it's possible to specify multiple networks here, this does not
|
|
||||||
mean that OSDs will create multiple listening sockets - they'll only
|
|
||||||
pick the first matching address of an UP + RUNNING interface. Separate
|
|
||||||
networks for cluster and client connections are also not implemented, but
|
|
||||||
they are mostly useless anyway, so it's not a big deal.
|
|
||||||
|
|
||||||
## bind_address
|
|
||||||
|
|
||||||
- Type: string
|
|
||||||
- Default: 0.0.0.0
|
|
||||||
|
|
||||||
Instead of the network mask, you can also set OSD listen address explicitly
|
|
||||||
using this parameter. May be useful if you want to start OSDs on interfaces
|
|
||||||
that are not UP + RUNNING.
|
|
||||||
|
|
||||||
## bind_port
|
|
||||||
|
|
||||||
- Type: integer
|
|
||||||
|
|
||||||
By default, OSDs pick random ports to use for incoming connections
|
|
||||||
automatically. With this option you can set a specific port for a specific
|
|
||||||
OSD by hand.
|
|
||||||
|
|
||||||
## autosync_interval
|
## autosync_interval
|
||||||
|
|
||||||
- Type: seconds
|
- Type: seconds
|
||||||
|
@ -316,7 +307,7 @@ for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
|
||||||
decrease write performance for fast disks because page cache is an overhead
|
decrease write performance for fast disks because page cache is an overhead
|
||||||
itself.
|
itself.
|
||||||
|
|
||||||
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
|
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
|
||||||
(which requires disable_data_fsync) with drives having write-back cache
|
(which requires disable_data_fsync) with drives having write-back cache
|
||||||
which can't be turned off, for example, Intel Optane. Also note that *some*
|
which can't be turned off, for example, Intel Optane. Also note that *some*
|
||||||
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
|
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
|
||||||
|
@ -629,3 +620,30 @@ are changed to 0.
|
||||||
|
|
||||||
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
||||||
are treated as outliers and ignored in aggregation.
|
are treated as outliers and ignored in aggregation.
|
||||||
|
|
||||||
|
## discard_on_start
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
|
||||||
|
Discard (SSD TRIM) unused data device blocks on every OSD startup.
|
||||||
|
|
||||||
|
## min_discard_size
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 1048576
|
||||||
|
|
||||||
|
Minimum size of a consecutive block of data for it to be discarded via TRIM.
|
||||||
|
|
||||||
|
## allow_net_split
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
- Default: false
|
||||||
|
|
||||||
|
Allow "safe" cases of network splits/partitions - allow PGs to start without
|
||||||
|
connections to some OSDs currently registered as alive in etcd, if the number
|
||||||
|
of actually connected PG OSDs is at least pg_minsize. That is, allow some OSDs to lose
|
||||||
|
connectivity with some other OSDs as long as it doesn't break pg_minsize guarantees.
|
||||||
|
The downside is that it increases the probability of writing data into just pg_minsize
|
||||||
|
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
|
||||||
|
|
||||||
|
The old behaviour in versions up to 2.0.0 was identical to having allow_net_split enabled.
|
||||||
|
|
|
@ -8,16 +8,15 @@
|
||||||
|
|
||||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
момент с перезапуском OSD в /etc/vitastor/vitastor.conf или [vitastor-disk update-sb](../usage/disk.ru.md#update-sb),
|
||||||
изменения конфигурации в etcd.
|
а некоторые и без перезапуска, с помощью изменения конфигурации в etcd.
|
||||||
|
|
||||||
|
- [bind_address](#bind_address)
|
||||||
|
- [bind_port](#bind_port)
|
||||||
- [osd_iothread_count](#osd_iothread_count)
|
- [osd_iothread_count](#osd_iothread_count)
|
||||||
- [etcd_report_interval](#etcd_report_interval)
|
- [etcd_report_interval](#etcd_report_interval)
|
||||||
- [etcd_stats_interval](#etcd_stats_interval)
|
- [etcd_stats_interval](#etcd_stats_interval)
|
||||||
- [run_primary](#run_primary)
|
- [run_primary](#run_primary)
|
||||||
- [osd_network](#osd_network)
|
|
||||||
- [bind_address](#bind_address)
|
|
||||||
- [bind_port](#bind_port)
|
|
||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
@ -62,6 +61,26 @@
|
||||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||||
|
- [discard_on_start](#discard_on_start)
|
||||||
|
- [min_discard_size](#min_discard_size)
|
||||||
|
- [allow_net_split](#allow_net_split)
|
||||||
|
|
||||||
|
## bind_address
|
||||||
|
|
||||||
|
- Тип: строка или массив строк
|
||||||
|
|
||||||
|
Вместо использования масок подсети ([osd_network](network.ru.md#osd_network) и
|
||||||
|
[osd_cluster_network](network.ru.md#osd_cluster_network)), вы также можете явно
|
||||||
|
задать адрес(а), на которых будут ожидать соединений OSD, с помощью данного
|
||||||
|
параметра. Это может быть полезно, например, чтобы запускать OSD на неподнятых
|
||||||
|
интерфейсах (не UP + RUNNING).
|
||||||
|
|
||||||
|
## bind_port
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
|
||||||
|
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
||||||
|
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
||||||
|
|
||||||
## osd_iothread_count
|
## osd_iothread_count
|
||||||
|
|
||||||
|
@ -107,34 +126,6 @@ max_etcd_attempts * etcd_quick_timeout.
|
||||||
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
||||||
понадобиться, поэтому это не реализовано.
|
понадобиться, поэтому это не реализовано.
|
||||||
|
|
||||||
## osd_network
|
|
||||||
|
|
||||||
- Тип: строка или массив строк
|
|
||||||
|
|
||||||
Маска подсети (IPv4 или IPv6) для использования для соединений с OSD.
|
|
||||||
Имейте в виду, что хотя сейчас и можно передать в этот параметр несколько
|
|
||||||
подсетей, это не означает, что OSD будут создавать несколько слушающих
|
|
||||||
сокетов - они лишь будут выбирать адрес первого поднятого (состояние UP +
|
|
||||||
RUNNING) интерфейса, подходящий под заданную маску. Также не реализовано разделение
|
|
||||||
кластерной и публичной сетей OSD. Правда, от него обычно всё равно довольно
|
|
||||||
мало толку, так что особенной проблемы в этом нет.
|
|
||||||
|
|
||||||
## bind_address
|
|
||||||
|
|
||||||
- Тип: строка
|
|
||||||
- Значение по умолчанию: 0.0.0.0
|
|
||||||
|
|
||||||
Этим параметром можно явным образом задать адрес, на котором будет ожидать
|
|
||||||
соединений OSD (вместо использования маски подсети). Может быть полезно,
|
|
||||||
например, чтобы запускать OSD на неподнятых интерфейсах (не UP + RUNNING).
|
|
||||||
|
|
||||||
## bind_port
|
|
||||||
|
|
||||||
- Тип: целое число
|
|
||||||
|
|
||||||
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
|
||||||
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
|
||||||
|
|
||||||
## autosync_interval
|
## autosync_interval
|
||||||
|
|
||||||
- Тип: секунды
|
- Тип: секунды
|
||||||
|
@ -660,3 +651,31 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
|
||||||
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
Большие значения считаются случайными выбросами и игнорируются в
|
Большие значения считаются случайными выбросами и игнорируются в
|
||||||
усреднении.
|
усреднении.
|
||||||
|
|
||||||
|
## discard_on_start
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
|
||||||
|
Освобождать (SSD TRIM) неиспользуемые блоки диска данных при каждом запуске OSD.
|
||||||
|
|
||||||
|
## min_discard_size
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 1048576
|
||||||
|
|
||||||
|
Минимальный размер последовательного блока данных, чтобы освобождать его через TRIM.
|
||||||
|
|
||||||
|
## allow_net_split
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
- Значение по умолчанию: false
|
||||||
|
|
||||||
|
Разрешить "безопасные" случаи разделений сети - разрешить активировать PG без
|
||||||
|
соединений к некоторым OSD, помеченным активными в etcd, если общее число активных
|
||||||
|
OSD в PG составляет как минимум pg_minsize. То есть, разрешать некоторым OSD терять
|
||||||
|
соединения с некоторыми другими OSD, если это не нарушает гарантий pg_minsize.
|
||||||
|
Минус такого разрешения в том, что оно повышает вероятность записи данных ровно в
|
||||||
|
pg_minsize OSD во время переключений, что может потом привести к тому, что PG станут
|
||||||
|
неполными (incomplete), если упадут ещё какие-то OSD.
|
||||||
|
|
||||||
|
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
||||||
|
|
|
@ -43,7 +43,7 @@ Parameters:
|
||||||
- [osd_tags](#osd_tags)
|
- [osd_tags](#osd_tags)
|
||||||
- [primary_affinity_tags](#primary_affinity_tags)
|
- [primary_affinity_tags](#primary_affinity_tags)
|
||||||
- [scrub_interval](#scrub_interval)
|
- [scrub_interval](#scrub_interval)
|
||||||
- [used_for_fs](#used_for_fs)
|
- [used_for_app](#used_for_app)
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
|
@ -189,6 +189,9 @@ So, pg_minsize regulates the number of failures that a pool can tolerate
|
||||||
without temporary downtime for [osd_out_time](monitor.en.md#osd_out_time),
|
without temporary downtime for [osd_out_time](monitor.en.md#osd_out_time),
|
||||||
but at a cost of slightly reduced storage reliability.
|
but at a cost of slightly reduced storage reliability.
|
||||||
|
|
||||||
|
See also [allow_net_split](osd.en.md#allow_net_split) and
|
||||||
|
[PG state descriptions](../usage/admin.en.md#pg-states).
|
||||||
|
|
||||||
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
|
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
|
||||||
read-only instead of deactivating them.
|
read-only instead of deactivating them.
|
||||||
|
|
||||||
|
@ -377,24 +380,37 @@ of the OSDs containing a data chunk for a PG.
|
||||||
Automatic scrubbing interval for this pool. Overrides
|
Automatic scrubbing interval for this pool. Overrides
|
||||||
[global scrub_interval setting](osd.en.md#scrub_interval).
|
[global scrub_interval setting](osd.en.md#scrub_interval).
|
||||||
|
|
||||||
## used_for_fs
|
## used_for_app
|
||||||
|
|
||||||
- Type: string
|
- Type: string
|
||||||
|
|
||||||
If non-empty, the pool is marked as used for VitastorFS with metadata stored
|
If non-empty, the pool is marked as used for a separate application, for example,
|
||||||
in block image (regular Vitastor volume) named as the value of this pool parameter.
|
VitastorFS or S3, which allocates Vitastor volume IDs by itself and does not use
|
||||||
|
image/inode metadata in etcd.
|
||||||
|
|
||||||
When a pool is marked as used for VitastorFS, regular block volume creation in it
|
When a pool is marked as used for such an app, regular block volume creation in it
|
||||||
is disabled (vitastor-cli refuses to create images without --force) to protect
|
is disabled (vitastor-cli refuses to create images without --force) to protect
|
||||||
the user from block volume and FS file ID collisions and data loss.
|
the user from block volume and FS/S3 volume ID collisions and data loss.
|
||||||
|
|
||||||
[vitastor-nfs](../usage/nfs.ru.md), in its turn, refuses to use pools not marked
|
Also such pools do not calculate per-inode space usage statistics in etcd because
|
||||||
|
using it for an external application implies that it may contain a very large
|
||||||
|
number of volumes and their statistics may take too much space in etcd.
|
||||||
|
|
||||||
|
Setting used_for_app to `fs:<name>` tells Vitastor that the pool is used for VitastorFS
|
||||||
|
with VitastorKV metadata base stored in a block image (regular Vitastor volume) named
|
||||||
|
`<name>`.
|
||||||
|
|
||||||
|
[vitastor-nfs](../usage/nfs.en.md), in its turn, refuses to use pools not marked
|
||||||
for the corresponding FS when starting. This also implies that you can use one
|
for the corresponding FS when starting. This also implies that you can use one
|
||||||
pool only for one VitastorFS.
|
pool only for one VitastorFS.
|
||||||
|
|
||||||
The second thing that is disabled for VitastorFS pools is reporting per-inode space
|
If you plan to use the pool for S3, set its used_for_app to `s3:<name>`. `<name>` may
|
||||||
usage statistics in etcd because a FS pool may store a very large number of files
|
be basically anything you want (for example, `s3:standard`) - it's not validated
|
||||||
and statistics for them all would take a lot of space in etcd.
|
by Vitastor S3 components in any way.
|
||||||
|
|
||||||
|
All other values, except those prefixed with `fs:` or `s3:`, may be used freely and don't
|
||||||
|
mean anything special for Vitastor core components. For now, you can use them as
|
||||||
|
you wish.
|
||||||
|
|
||||||
# Examples
|
# Examples
|
||||||
|
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
- [osd_tags](#osd_tags)
|
- [osd_tags](#osd_tags)
|
||||||
- [primary_affinity_tags](#primary_affinity_tags)
|
- [primary_affinity_tags](#primary_affinity_tags)
|
||||||
- [scrub_interval](#scrub_interval)
|
- [scrub_interval](#scrub_interval)
|
||||||
- [used_for_fs](#used_for_fs)
|
- [used_for_app](#used_for_app)
|
||||||
|
|
||||||
Примеры:
|
Примеры:
|
||||||
|
|
||||||
|
@ -256,7 +256,7 @@ PG в Vitastor эферемерны, то есть вы можете менят
|
||||||
|
|
||||||
## raw_placement
|
## raw_placement
|
||||||
|
|
||||||
- Type: string
|
- Тип: строка
|
||||||
|
|
||||||
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
|
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
|
||||||
Используйте, только если действительно знаете, зачем вам это надо :)
|
Используйте, только если действительно знаете, зачем вам это надо :)
|
||||||
|
@ -383,26 +383,42 @@ OSD с "all".
|
||||||
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
|
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
|
||||||
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
|
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
|
||||||
|
|
||||||
## used_for_fs
|
## used_for_app
|
||||||
|
|
||||||
- Type: string
|
- Тип: строка
|
||||||
|
|
||||||
Если непусто, пул помечается как используемый для файловой системы VitastorFS с
|
Если непусто, пул помечается как используемый для отдельного приложения, например,
|
||||||
метаданными, хранимыми в блочном образе Vitastor с именем, равным значению
|
для VitastorFS или S3, которое распределяет ID образов в пуле само и не использует
|
||||||
этого параметра.
|
метаданные образов/инодов в etcd.
|
||||||
|
|
||||||
Когда пул помечается как используемый для VitastorFS, создание обычных блочных
|
Когда пул помечается используемым для такого приложения, создание обычных блочных
|
||||||
образов в нём отключается (vitastor-cli отказывается создавать образы без --force),
|
образов в нём запрещается (vitastor-cli отказывается создавать образы без --force),
|
||||||
чтобы защитить пользователя от коллизий ID файлов и блочных образов и, таким
|
чтобы защитить пользователя от коллизий ID блочных образов и томов ФС/S3, и,
|
||||||
образом, от потери данных.
|
таким образом, от потери данных.
|
||||||
|
|
||||||
|
Также для таких пулов отключается передача статистики в etcd по отдельным инодам,
|
||||||
|
так как использование для внешнего приложения подразумевает, что пул может содержать
|
||||||
|
очень много томов и их статистика может занять слишком много места в etcd.
|
||||||
|
|
||||||
|
Установка used_for_app в значение `fs:<name>` сообщает о том, что пул используется
|
||||||
|
для VitastorFS с базой метаданных VitastorKV, хранимой в блочном образе с именем
|
||||||
|
`<name>`.
|
||||||
|
|
||||||
[vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
|
[vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
|
||||||
использовать для ФС пулы, не выделенные для неё. Это также означает, что один
|
использовать для ФС пулы, не помеченные как используемые для неё. Это также
|
||||||
пул может использоваться только для одной VitastorFS.
|
означает, что один пул может использоваться только для одной VitastorFS.
|
||||||
|
|
||||||
Также для ФС-пулов отключается передача статистики в etcd по отдельным инодам,
|
Если же вы планируете использовать пул для данных S3, установите его used_for_app
|
||||||
так как ФС-пул может содержать очень много файлов и статистика по ним всем
|
в значение `s3:<name>`, где `<name>` - любое название по вашему усмотрению
|
||||||
заняла бы очень много места в etcd.
|
(например, `s3:standard`) - конкретное содержимое `<name>` пока никак не проверяется
|
||||||
|
компонентами Vitastor S3.
|
||||||
|
|
||||||
|
Смотрите также [allow_net_split](osd.ru.md#allow_net_split) и
|
||||||
|
[документацию по состояниям PG](../usage/admin.ru.md#состояния-pg).
|
||||||
|
|
||||||
|
Все остальные значения used_for_app, кроме начинающихся на `fs:` или `s3:`, не
|
||||||
|
означают ничего особенного для основных компонентов Vitastor. Поэтому сейчас вы
|
||||||
|
можете использовать их свободно любым желаемым способом.
|
||||||
|
|
||||||
# Примеры
|
# Примеры
|
||||||
|
|
||||||
|
|
|
@ -75,11 +75,11 @@
|
||||||
- name: mon_http_port
|
- name: mon_http_port
|
||||||
type: int
|
type: int
|
||||||
default: 8060
|
default: 8060
|
||||||
info: HTTP port for monitors to listen on (including metrics exporter)
|
info: HTTP port for monitors to listen on (including metrics exporter)
|
||||||
info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
|
info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
|
||||||
- name: mon_http_ip
|
- name: mon_http_ip
|
||||||
type: string
|
type: string
|
||||||
info: IP address for monitors to listen on (all addresses by default)
|
info: IP address for monitors to listen on (all addresses by default)
|
||||||
info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
|
info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
|
||||||
- name: mon_https_cert
|
- name: mon_https_cert
|
||||||
type: string
|
type: string
|
||||||
|
|
|
@ -1,49 +1,78 @@
|
||||||
- name: tcp_header_buffer_size
|
- name: osd_network
|
||||||
type: int
|
type: string or array of strings
|
||||||
default: 65536
|
type_ru: строка или массив строк
|
||||||
info: |
|
info: |
|
||||||
Size of the buffer used to read data using an additional copy. Vitastor
|
Network mask of public OSD network(s) (IPv4 or IPv6). Each OSD listens on all
|
||||||
packet headers are 128 bytes, payload is always at least 4 KB, so it is
|
addresses of UP + RUNNING interfaces matching one of these networks, on the
|
||||||
usually beneficial to try to read multiple packets at once even though
|
same port. Port is auto-selected except if [bind_port](osd.en.md#bind_port) is
|
||||||
it requires copying the data an additional time. The rest of each packet
|
explicitly specified. Bind address(es) may also be overridden manually by
|
||||||
is received without an additional copy. You can try to play with this
|
specifying [bind_address](osd.en.md#bind_address). If OSD networks are not specified
|
||||||
parameter and see how it affects random iops and linear bandwidth if you
|
at all, the OSD just listens on the wildcard address (0.0.0.0).
|
||||||
want.
|
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Размер буфера для чтения данных с дополнительным копированием. Пакеты
|
Маски подсетей (IPv4 или IPv6) публичной сети или сетей OSD. Каждый OSD слушает
|
||||||
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
|
один и тот же порт на всех адресах поднятых (UP + RUNNING) сетевых интерфейсов,
|
||||||
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
|
соответствующих одной из указанных сетей. Порт выбирается автоматически, если
|
||||||
сразу несколько пакетов, даже несмотря на то, что это требует лишний раз
|
только [bind_port](osd.ru.md#bind_port) не задан явно. Адреса для подключений можно
|
||||||
скопировать данные. Часть каждого пакета за пределами значения данного
|
также переопределить явно, задав [bind_address](osd.ru.md#bind_address). Если сети OSD
|
||||||
параметра читается без дополнительного копирования. Вы можете попробовать
|
не заданы вообще, OSD слушает все адреса (0.0.0.0).
|
||||||
поменять этот параметр и посмотреть, как он влияет на производительность
|
- name: osd_cluster_network
|
||||||
случайного и линейного доступа.
|
type: string or array of strings
|
||||||
- name: use_sync_send_recv
|
type_ru: строка или массив строк
|
||||||
type: bool
|
|
||||||
default: false
|
|
||||||
info: |
|
info: |
|
||||||
If true, synchronous send/recv syscalls are used instead of io_uring for
|
Network mask of separate network(s) (IPv4 or IPv6) to use for OSD
|
||||||
socket communication. Useless for OSDs because they require io_uring anyway,
|
cluster connections. I.e. OSDs will always attempt to use these networks
|
||||||
but may be required for clients with old kernel versions.
|
to connect to other OSDs, while clients will attempt to use networks from
|
||||||
|
[osd_network](#osd_network).
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Если установлено в истину, то вместо io_uring для передачи данных по сети
|
Маски подсетей (IPv4 или IPv6) отдельной кластерной сети или сетей OSD.
|
||||||
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
То есть, OSD будут всегда стараться использовать эти сети для соединений
|
||||||
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
с другими OSD, а клиенты будут стараться использовать сети из [osd_network](#osd_network).
|
||||||
принципе, это может применяться для клиентов со старыми версиями ядра.
|
|
||||||
- name: use_rdma
|
- name: use_rdma
|
||||||
type: bool
|
type: bool
|
||||||
default: true
|
default: true
|
||||||
info: |
|
info: |
|
||||||
Try to use RDMA for communication if it's available. Disable if you don't
|
Try to use RDMA through libibverbs for communication if it's available.
|
||||||
want Vitastor to use RDMA. TCP-only clients can also talk to an RDMA-enabled
|
Disable if you don't want Vitastor to use RDMA. TCP-only clients can also
|
||||||
cluster, so disabling RDMA may be needed if clients have RDMA devices,
|
talk to an RDMA-enabled cluster, so disabling RDMA may be needed if clients
|
||||||
but they are not connected to the cluster.
|
have RDMA devices, but they are not connected to the cluster.
|
||||||
|
|
||||||
|
`use_rdma` works with RoCEv1/RoCEv2 networks, but not with iWARP and,
|
||||||
|
maybe, not with some Infiniband configurations which require RDMA-CM.
|
||||||
|
Consider `use_rdmacm` for such networks.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Пытаться использовать RDMA для связи при наличии доступных устройств.
|
Попробовать использовать RDMA через libibverbs для связи при наличии
|
||||||
Отключите, если вы не хотите, чтобы Vitastor использовал RDMA.
|
доступных устройств. Отключите, если вы не хотите, чтобы Vitastor
|
||||||
TCP-клиенты также могут работать с RDMA-кластером, так что отключать
|
использовал RDMA. TCP-клиенты также могут работать с RDMA-кластером,
|
||||||
RDMA может быть нужно только если у клиентов есть RDMA-устройства,
|
так что отключать RDMA может быть нужно, только если у клиентов есть
|
||||||
но они не имеют соединения с кластером Vitastor.
|
RDMA-устройства, но они не имеют соединения с кластером Vitastor.
|
||||||
|
|
||||||
|
`use_rdma` работает с RoCEv1/RoCEv2 сетями, но не работает с iWARP и
|
||||||
|
может не работать с частью конфигураций Infiniband, требующих RDMA-CM.
|
||||||
|
Рассмотрите включение `use_rdmacm` для таких сетей.
|
||||||
|
- name: use_rdmacm
|
||||||
|
type: bool
|
||||||
|
default: true
|
||||||
|
info: |
|
||||||
|
Use an alternative implementation of RDMA through RDMA-CM (Connection
|
||||||
|
Manager). Works with all RDMA networks: Infiniband, iWARP and
|
||||||
|
RoCEv1/RoCEv2, and even allows disabling TCP and running only with RDMA.
|
||||||
|
OSDs always use random port numbers for RDMA-CM listeners, different
|
||||||
|
from their TCP ports. `use_rdma` is automatically disabled when
|
||||||
|
`use_rdmacm` is enabled.
|
||||||
|
info_ru: |
|
||||||
|
Использовать альтернативную реализацию RDMA на основе RDMA-CM (Connection
|
||||||
|
Manager). Работает со всеми типами RDMA-сетей: Infiniband, iWARP и
|
||||||
|
RoCEv1/RoCEv2, и даже позволяет полностью отключить TCP и работать
|
||||||
|
только на RDMA. OSD используют случайные номера портов для ожидания
|
||||||
|
соединений через RDMA-CM, отличающиеся от их TCP-портов. Также при
|
||||||
|
включении `use_rdmacm` автоматически отключается опция `use_rdma`.
|
||||||
|
- name: disable_tcp
|
||||||
|
type: bool
|
||||||
|
default: true
|
||||||
|
info: |
|
||||||
|
Fully disable TCP and only use RDMA-CM for OSD communication.
|
||||||
|
info_ru: |
|
||||||
|
Полностью отключить TCP и использовать только RDMA-CM для соединений с OSD.
|
||||||
- name: rdma_device
|
- name: rdma_device
|
||||||
type: string
|
type: string
|
||||||
info: |
|
info: |
|
||||||
|
@ -93,16 +122,19 @@
|
||||||
Control) и ECN (Explicit Congestion Notification).
|
Control) и ECN (Explicit Congestion Notification).
|
||||||
- name: rdma_port_num
|
- name: rdma_port_num
|
||||||
type: int
|
type: int
|
||||||
default: 1
|
|
||||||
info: |
|
info: |
|
||||||
RDMA device port number to use. Only for devices that have more than 1 port.
|
RDMA device port number to use. Only for devices that have more than 1 port.
|
||||||
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
|
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
|
||||||
your device has.
|
your device has.
|
||||||
|
|
||||||
|
Not relevant for RDMA-CM (use_rdmacm).
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
|
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
|
||||||
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
|
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
|
||||||
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
|
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
|
||||||
`ibv_devinfo -v`.
|
`ibv_devinfo -v`.
|
||||||
|
|
||||||
|
Опция неприменима к RDMA-CM (use_rdmacm).
|
||||||
- name: rdma_gid_index
|
- name: rdma_gid_index
|
||||||
type: int
|
type: int
|
||||||
info: |
|
info: |
|
||||||
|
@ -116,6 +148,8 @@
|
||||||
GID auto-selection is unsupported with libibverbs < v32.
|
GID auto-selection is unsupported with libibverbs < v32.
|
||||||
|
|
||||||
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
|
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
|
||||||
|
|
||||||
|
Not relevant for RDMA-CM (use_rdmacm).
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Номер глобального идентификатора адреса RDMA-устройства, который следует
|
Номер глобального идентификатора адреса RDMA-устройства, который следует
|
||||||
использовать. Разным gid_index могут соответствовать разные протоколы связи:
|
использовать. Разным gid_index могут соответствовать разные протоколы связи:
|
||||||
|
@ -128,15 +162,16 @@
|
||||||
libibverbs < v32.
|
libibverbs < v32.
|
||||||
|
|
||||||
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
|
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
|
||||||
|
|
||||||
|
Опция неприменима к RDMA-CM (use_rdmacm).
|
||||||
- name: rdma_mtu
|
- name: rdma_mtu
|
||||||
type: int
|
type: int
|
||||||
default: 4096
|
|
||||||
info: |
|
info: |
|
||||||
RDMA Path MTU to use. Must be 1024, 2048 or 4096. There is usually no
|
RDMA Path MTU to use. Must be 1024, 2048 or 4096. Default is to use the
|
||||||
sense to change it from the default 4096.
|
RDMA device's MTU.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
|
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
|
||||||
2048 или 4096. Обычно нет смысла менять значение по умолчанию, равное 4096.
|
2048 или 4096. По умолчанию используется значение MTU RDMA-устройства.
|
||||||
- name: rdma_max_sge
|
- name: rdma_max_sge
|
||||||
type: int
|
type: int
|
||||||
default: 128
|
default: 128
|
||||||
|
@ -306,3 +341,47 @@
|
||||||
detect disconnections quickly.
|
detect disconnections quickly.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Интервал проверки живости вебсокет-подключений к etcd.
|
Интервал проверки живости вебсокет-подключений к etcd.
|
||||||
|
- name: etcd_min_reload_interval
|
||||||
|
type: ms
|
||||||
|
default: 1000
|
||||||
|
online: true
|
||||||
|
info: |
|
||||||
|
Minimum interval for full etcd state reload. Introduced to prevent
|
||||||
|
excessive load on etcd during outages when etcd can't keep up with event
|
||||||
|
streams and cancels them.
|
||||||
|
info_ru: |
|
||||||
|
Минимальный интервал полной перезагрузки состояния из etcd. Добавлено для
|
||||||
|
предотвращения избыточной нагрузки на etcd во время отказов, когда etcd не
|
||||||
|
успевает рассылать потоки событий и отменяет их.
|
||||||
|
- name: tcp_header_buffer_size
|
||||||
|
type: int
|
||||||
|
default: 65536
|
||||||
|
info: |
|
||||||
|
Size of the buffer used to read data using an additional copy. Vitastor
|
||||||
|
packet headers are 128 bytes, payload is always at least 4 KB, so it is
|
||||||
|
usually beneficial to try to read multiple packets at once even though
|
||||||
|
it requires copying the data an additional time. The rest of each packet
|
||||||
|
is received without an additional copy. You can try to play with this
|
||||||
|
parameter and see how it affects random iops and linear bandwidth if you
|
||||||
|
want.
|
||||||
|
info_ru: |
|
||||||
|
Размер буфера для чтения данных с дополнительным копированием. Пакеты
|
||||||
|
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
|
||||||
|
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
|
||||||
|
сразу несколько пакетов, даже несмотря на то, что это требует лишний раз
|
||||||
|
скопировать данные. Часть каждого пакета за пределами значения данного
|
||||||
|
параметра читается без дополнительного копирования. Вы можете попробовать
|
||||||
|
поменять этот параметр и посмотреть, как он влияет на производительность
|
||||||
|
случайного и линейного доступа.
|
||||||
|
- name: use_sync_send_recv
|
||||||
|
type: bool
|
||||||
|
default: false
|
||||||
|
info: |
|
||||||
|
If true, synchronous send/recv syscalls are used instead of io_uring for
|
||||||
|
socket communication. Useless for OSDs because they require io_uring anyway,
|
||||||
|
but may be required for clients with old kernel versions.
|
||||||
|
info_ru: |
|
||||||
|
Если установлено в истину, то вместо io_uring для передачи данных по сети
|
||||||
|
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
||||||
|
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
||||||
|
принципе, это может применяться для клиентов со старыми версиями ядра.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Runtime OSD Parameters
|
# Runtime OSD Parameters
|
||||||
|
|
||||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||||
initialization and can be changed - either with an OSD restart or, for some of
|
initialization and can be changed - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.en.md#update-sb)
|
||||||
them, even without restarting by updating configuration in etcd.
|
with an OSD restart or, for some of them, even without restarting by updating configuration in etcd.
|
||||||
|
|
|
@ -2,5 +2,5 @@
|
||||||
|
|
||||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
момент с перезапуском OSD в /etc/vitastor/vitastor.conf или [vitastor-disk update-sb](../usage/disk.ru.md#update-sb),
|
||||||
изменения конфигурации в etcd.
|
а некоторые и без перезапуска, с помощью изменения конфигурации в etcd.
|
||||||
|
|
|
@ -1,3 +1,26 @@
|
||||||
|
- name: bind_address
|
||||||
|
type: string or array of strings
|
||||||
|
type_ru: строка или массив строк
|
||||||
|
info: |
|
||||||
|
Instead of the network masks ([osd_network](network.en.md#osd_network) and
|
||||||
|
[osd_cluster_network](network.en.md#osd_cluster_network)), you can also set
|
||||||
|
OSD listen addresses explicitly using this parameter. May be useful if you
|
||||||
|
want to start OSDs on interfaces that are not UP + RUNNING.
|
||||||
|
info_ru: |
|
||||||
|
Вместо использования масок подсети ([osd_network](network.ru.md#osd_network) и
|
||||||
|
[osd_cluster_network](network.ru.md#osd_cluster_network)), вы также можете явно
|
||||||
|
задать адрес(а), на которых будут ожидать соединений OSD, с помощью данного
|
||||||
|
параметра. Это может быть полезно, например, чтобы запускать OSD на неподнятых
|
||||||
|
интерфейсах (не UP + RUNNING).
|
||||||
|
- name: bind_port
|
||||||
|
type: int
|
||||||
|
info: |
|
||||||
|
By default, OSDs pick random ports to use for incoming connections
|
||||||
|
automatically. With this option you can set a specific port for a specific
|
||||||
|
OSD by hand.
|
||||||
|
info_ru: |
|
||||||
|
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
||||||
|
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
||||||
- name: osd_iothread_count
|
- name: osd_iothread_count
|
||||||
type: int
|
type: int
|
||||||
default: 0
|
default: 0
|
||||||
|
@ -56,44 +79,6 @@
|
||||||
реализовать дополнительный режим для монитора, который позволит отделять
|
реализовать дополнительный режим для монитора, который позволит отделять
|
||||||
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
|
||||||
понадобиться, поэтому это не реализовано.
|
понадобиться, поэтому это не реализовано.
|
||||||
- name: osd_network
|
|
||||||
type: string or array of strings
|
|
||||||
type_ru: строка или массив строк
|
|
||||||
info: |
|
|
||||||
Network mask of the network (IPv4 or IPv6) to use for OSDs. Note that
|
|
||||||
although it's possible to specify multiple networks here, this does not
|
|
||||||
mean that OSDs will create multiple listening sockets - they'll only
|
|
||||||
pick the first matching address of an UP + RUNNING interface. Separate
|
|
||||||
networks for cluster and client connections are also not implemented, but
|
|
||||||
they are mostly useless anyway, so it's not a big deal.
|
|
||||||
info_ru: |
|
|
||||||
Маска подсети (IPv4 или IPv6) для использования для соединений с OSD.
|
|
||||||
Имейте в виду, что хотя сейчас и можно передать в этот параметр несколько
|
|
||||||
подсетей, это не означает, что OSD будут создавать несколько слушающих
|
|
||||||
сокетов - они лишь будут выбирать адрес первого поднятого (состояние UP +
|
|
||||||
RUNNING), подходящий под заданную маску. Также не реализовано разделение
|
|
||||||
кластерной и публичной сетей OSD. Правда, от него обычно всё равно довольно
|
|
||||||
мало толку, так что особенной проблемы в этом нет.
|
|
||||||
- name: bind_address
|
|
||||||
type: string
|
|
||||||
default: "0.0.0.0"
|
|
||||||
info: |
|
|
||||||
Instead of the network mask, you can also set OSD listen address explicitly
|
|
||||||
using this parameter. May be useful if you want to start OSDs on interfaces
|
|
||||||
that are not UP + RUNNING.
|
|
||||||
info_ru: |
|
|
||||||
Этим параметром можно явным образом задать адрес, на котором будет ожидать
|
|
||||||
соединений OSD (вместо использования маски подсети). Может быть полезно,
|
|
||||||
например, чтобы запускать OSD на неподнятых интерфейсах (не UP + RUNNING).
|
|
||||||
- name: bind_port
|
|
||||||
type: int
|
|
||||||
info: |
|
|
||||||
By default, OSDs pick random ports to use for incoming connections
|
|
||||||
automatically. With this option you can set a specific port for a specific
|
|
||||||
OSD by hand.
|
|
||||||
info_ru: |
|
|
||||||
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
|
|
||||||
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
|
|
||||||
- name: autosync_interval
|
- name: autosync_interval
|
||||||
type: sec
|
type: sec
|
||||||
default: 5
|
default: 5
|
||||||
|
@ -315,7 +300,7 @@
|
||||||
decrease write performance for fast disks because page cache is an overhead
|
decrease write performance for fast disks because page cache is an overhead
|
||||||
itself.
|
itself.
|
||||||
|
|
||||||
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
|
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
|
||||||
(which requires disable_data_fsync) with drives having write-back cache
|
(which requires disable_data_fsync) with drives having write-back cache
|
||||||
which can't be turned off, for example, Intel Optane. Also note that *some*
|
which can't be turned off, for example, Intel Optane. Also note that *some*
|
||||||
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
|
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
|
||||||
|
@ -765,3 +750,34 @@
|
||||||
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
Большие значения считаются случайными выбросами и игнорируются в
|
Большие значения считаются случайными выбросами и игнорируются в
|
||||||
усреднении.
|
усреднении.
|
||||||
|
- name: discard_on_start
|
||||||
|
type: bool
|
||||||
|
info: Discard (SSD TRIM) unused data device blocks on every OSD startup.
|
||||||
|
info_ru: Освобождать (SSD TRIM) неиспользуемые блоки диска данных при каждом запуске OSD.
|
||||||
|
- name: min_discard_size
|
||||||
|
type: int
|
||||||
|
default: 1048576
|
||||||
|
info: Minimum size of a consecutive block for it to be discarded with TRIM.
|
||||||
|
info_ru: Минимальный размер последовательного блока данных, чтобы освобождать его через TRIM.
|
||||||
|
- name: allow_net_split
|
||||||
|
type: bool
|
||||||
|
default: false
|
||||||
|
info: |
|
||||||
|
Allow "safe" cases of network splits/partitions - allow starting PGs without
|
||||||
|
connections to some OSDs currently registered as alive in etcd, if the number
|
||||||
|
of actually connected PG OSDs is at least pg_minsize. That is, allow some OSDs to lose
|
||||||
|
connectivity with some other OSDs as long as it doesn't break pg_minsize guarantees.
|
||||||
|
The downside is that it increases the probability of writing data into just pg_minsize
|
||||||
|
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
|
||||||
|
|
||||||
|
The old behaviour in versions up to 2.0.0 was equal to enabled allow_net_split.
|
||||||
|
info_ru: |
|
||||||
|
Разрешить "безопасные" случаи разделений сети - разрешить активировать PG без
|
||||||
|
соединений к некоторым OSD, помеченным активными в etcd, если общее число активных
|
||||||
|
OSD в PG составляет как минимум pg_minsize. То есть, разрешать некоторым OSD терять
|
||||||
|
соединения с некоторыми другими OSD, если это не нарушает гарантий pg_minsize.
|
||||||
|
Минус такого разрешения в том, что оно повышает вероятность записи данных ровно в
|
||||||
|
pg_minsize OSD во время переключений, что может потом привести к тому, что PG станут
|
||||||
|
неполными (incomplete), если упадут ещё какие-то OSD.
|
||||||
|
|
||||||
|
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
||||||
|
|
|
@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
|
||||||
The instruction is very simple.
|
The instruction is very simple.
|
||||||
|
|
||||||
1. Download a Docker image of the desired version: \
|
1. Download a Docker image of the desired version: \
|
||||||
`docker pull vitastor:1.10.2`
|
`docker pull vitastor:2.1.0`
|
||||||
2. Install scripts to the host system: \
|
2. Install scripts to the host system: \
|
||||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
|
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.1.0 install.sh`
|
||||||
3. Reload udev rules: \
|
3. Reload udev rules: \
|
||||||
`udevadm control --reload-rules`
|
`udevadm control --reload-rules`
|
||||||
|
|
||||||
|
|
|
@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
|
||||||
Инструкция по установке максимально простая.
|
Инструкция по установке максимально простая.
|
||||||
|
|
||||||
1. Скачайте Docker-образ желаемой версии: \
|
1. Скачайте Docker-образ желаемой версии: \
|
||||||
`docker pull vitastor:1.10.2`
|
`docker pull vitastor:2.1.0`
|
||||||
2. Установите скрипты в хост-систему командой: \
|
2. Установите скрипты в хост-систему командой: \
|
||||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
|
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.1.0 install.sh`
|
||||||
3. Перезагрузите правила udev: \
|
3. Перезагрузите правила udev: \
|
||||||
`udevadm control --reload-rules`
|
`udevadm control --reload-rules`
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,191 @@
|
||||||
|
[Documentation](../../README.md#documentation) → Installation → S3 for Vitastor
|
||||||
|
|
||||||
|
-----
|
||||||
|
|
||||||
|
[Читать на русском](s3.ru.md)
|
||||||
|
|
||||||
|
# S3 for Vitastor
|
||||||
|
|
||||||
|
The moment has come - Vitastor S3 implementation based on Zenko CloudServer is released.
|
||||||
|
|
||||||
|
## Highlights
|
||||||
|
|
||||||
|
- Zenko CloudServer is implemented in node.js.
|
||||||
|
- Object metadata is stored in MongoDB.
|
||||||
|
- Modified Zenko CloudServer version is used for Vitastor. It is slightly different from
|
||||||
|
the original, has an optimised build and has unneeded dependencies stripped out.
|
||||||
|
- Object data is stored in Vitastor block volumes, but the volume metadata is stored in
|
||||||
|
the same MongoDB, not in Vitastor etcd.
|
||||||
|
- Objects are written to volumes sequentially one after another. The space is allocated
|
||||||
|
with rounding to the sector size (4 KB), so each object takes at least 4 KB.
|
||||||
|
- An important property of such a storage scheme is that small objects aren't chunked into
|
||||||
|
parts in Vitastor EC N+K pools and thus don't require reads from all N disks when
|
||||||
|
downloading.
|
||||||
|
- Deleted objects are marked as deleted, but the space is only actually freed during an
|
||||||
|
asynchronously executed "defragmentation" process. Defragmentation runs automatically
|
||||||
|
in the background when a volume reaches the configured amount of "garbage" (20% by default).
|
||||||
|
Defragmentation copies live (not deleted) objects to new volume(s) and then removes the old volume.
|
||||||
|
Defragmentation can be configured in locationConfig.json.
|
||||||
|
|
||||||
|
## Plans for future development
|
||||||
|
|
||||||
|
- User account storage in the DB instead of a static file. Original Zenko uses
|
||||||
|
a separate closed-source "Scality Vault" service for it, that's why we use
|
||||||
|
a static file for now.
|
||||||
|
- More detailed documentation.
|
||||||
|
- Support for other (and faster) key-value DBMS for object metadata storage.
|
||||||
|
- Other performance optimisations, for example, related to the used hash function -
|
||||||
|
MD5 used for Amazon compatibility purposes is relatively slow.
|
||||||
|
- Object Lifecycle support. There is a Lifecycle implementation for Zenko called
|
||||||
|
[Backbeat](https://github.com/scality/backbeat) but it's not adapted for Vitastor yet.
|
||||||
|
- Quota support. Original Zenko uses a separate "SCUBA" service for quotas, but
|
||||||
|
it's also proprietary and not available publicly.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
In a few words:
|
||||||
|
|
||||||
|
- Install MongoDB, create a user for S3 metadata DB.
|
||||||
|
- Create a Vitastor pool for S3 data.
|
||||||
|
- Download and set up the Docker container `vitalif/vitastor-zenko`.
|
||||||
|
|
||||||
|
### Setup MongoDB
|
||||||
|
|
||||||
|
You can set up MongoDB yourself, following the [MongoDB manual](https://www.mongodb.com/docs/manual/installation/).
|
||||||
|
|
||||||
|
Or you can follow the instructions below - they describe a simple example of MongoDB setup
|
||||||
|
in Docker (through docker-compose) with 3 replicas.
|
||||||
|
|
||||||
|
1. On each host, create a file `docker-compose.yml` with the content listed below.
|
||||||
|
Replace `<YOUR_PASSWORD>` with your future mongodb administrator password, and optionally
|
||||||
|
replace `0.0.0.0` with `localhost,<server_IP>`. It's recommended to either use a private IP
|
||||||
|
or [setup TLS](https://www.mongodb.com/docs/manual/tutorial/configure-ssl/) afterwards.
|
||||||
|
|
||||||
|
```
|
||||||
|
version: '3.1'
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
mongo:
|
||||||
|
container_name: mongo
|
||||||
|
image: mongo:7-jammy
|
||||||
|
restart: always
|
||||||
|
environment:
|
||||||
|
MONGO_INITDB_ROOT_USERNAME: root
|
||||||
|
MONGO_INITDB_ROOT_PASSWORD: <YOUR_PASSWORD>
|
||||||
|
network_mode: host
|
||||||
|
volumes:
|
||||||
|
- ./keyfile:/opt/keyfile
|
||||||
|
- ./mongo-data/db:/data/db
|
||||||
|
- ./mongo-data/configdb:/data/configdb
|
||||||
|
entrypoint: /bin/bash -c
|
||||||
|
command: [ "chown mongodb /opt/keyfile && chmod 600 /opt/keyfile && . /usr/local/bin/docker-entrypoint.sh mongod --replSet rs0 --keyFile /opt/keyfile --bind_ip 0.0.0.0" ]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Generate a shared cluster key using `openssl rand -base64 756 > ./keyfile` and copy
|
||||||
|
that `keyfile` to all hosts.
|
||||||
|
|
||||||
|
3. Start MongoDB on all hosts with `docker compose up -d mongo`.
|
||||||
|
|
||||||
|
4. Enter Mongo Shell with `docker exec -it mongo mongosh -u root -p <YOUR_PASSWORD> localhost/admin`
|
||||||
|
and execute the following command (replace IP addresses `10.10.10.{1,2,3}` with your host IPs):
|
||||||
|
|
||||||
|
`rs.initiate({ _id: 'rs0', members: [
|
||||||
|
{ _id: 1, host: '10.10.10.1:27017' },
|
||||||
|
{ _id: 2, host: '10.10.10.2:27017' },
|
||||||
|
{ _id: 3, host: '10.10.10.3:27017' }
|
||||||
|
] })`
|
||||||
|
|
||||||
|
5. Stay in Mongo Shell and create a user for the future S3 database:
|
||||||
|
|
||||||
|
`db.createUser({ user: 's3', pwd: '<YOUR_S3_PASSWORD>', roles: [
|
||||||
|
{ role: 'readWrite', db: 's3' },
|
||||||
|
{ role: 'dbAdmin', db: 's3' },
|
||||||
|
{ role: 'readWrite', db: 'vitastor' },
|
||||||
|
{ role: 'dbAdmin', db: 'vitastor' }
|
||||||
|
] })`
|
||||||
|
|
||||||
|
### Setup Vitastor
|
||||||
|
|
||||||
|
Create a pool in Vitastor for S3 object data, for example:
|
||||||
|
|
||||||
|
`vitastor-cli create-pool --ec 2+1 -n 512 s3-data --used_for_app s3:standard`
|
||||||
|
|
||||||
|
The `--used_for_app` option works as fool-proofing and prevents you from
|
||||||
|
accidentally creating a regular block volume in the S3 pool and overwriting some S3 data.
|
||||||
|
Also it hides inode space statistics from Vitastor etcd.
|
||||||
|
|
||||||
|
Retrieve the ID of your pool with `vitastor-cli ls-pools s3-data --detail`.
|
||||||
|
|
||||||
|
### Setup Vitastor S3
|
||||||
|
|
||||||
|
1. Add the following lines to `docker-compose.yml` (instead of `network_mode: host`,
|
||||||
|
you can use `ports: [ "8000:8000", "8002:8002" ]`):
|
||||||
|
|
||||||
|
```
|
||||||
|
zenko:
|
||||||
|
container_name: zenko
|
||||||
|
image: vitalif/vitastor-zenko
|
||||||
|
restart: always
|
||||||
|
security_opt:
|
||||||
|
- seccomp:unconfined
|
||||||
|
ulimits:
|
||||||
|
memlock: -1
|
||||||
|
network_mode: host
|
||||||
|
volumes:
|
||||||
|
- /etc/vitastor:/etc/vitastor
|
||||||
|
- /etc/vitastor/s3:/conf
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Download Docker image: `docker pull vitalif/vitastor-zenko`
|
||||||
|
|
||||||
|
3. Extract configuration file examples from the Docker image:
|
||||||
|
```
|
||||||
|
docker run --rm -it -v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf vitalif/vitastor-zenko configure.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Edit configuration files in `/etc/vitastor/s3/`:
|
||||||
|
- `config.json` - common settings.
|
||||||
|
- `authdata.json` - user accounts and access keys.
|
||||||
|
- `locationConfig.json` - S3 storage class list with placement settings.
|
||||||
|
Note: it actually contains storage classes (like STANDARD, COLD, etc)
|
||||||
|
instead of "locations" (zones like us-east-1) as in the original Zenko CloudServer.
|
||||||
|
- Put your MongoDB connection data into `config.json` and `locationConfig.json`.
|
||||||
|
- Put your Vitastor pool ID into `locationConfig.json`.
|
||||||
|
- For now, the complete list of Vitastor backend settings is only available [in the code](https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts#L94).
|
||||||
|
|
||||||
|
### Start Zenko
|
||||||
|
|
||||||
|
Start the S3 server with:
|
||||||
|
|
||||||
|
```
|
||||||
|
docker run --restart always --security-opt seccomp:unconfined --ulimit memlock=-1 --network=host \
|
||||||
|
-v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf --name zenko vitalif/vitastor-zenko
|
||||||
|
```
|
||||||
|
|
||||||
|
If you use default settings, Zenko CloudServer starts on port 8000.
|
||||||
|
The default access key is `accessKey1` with a secret key of `verySecretKey1`.
|
||||||
|
|
||||||
|
Now you can access your S3 with, for example, [s3cmd](https://s3tools.org/s3cmd):
|
||||||
|
|
||||||
|
```
|
||||||
|
s3cmd --access_key=accessKey1 --secret_key=verySecretKey1 --host=http://localhost:8000 mb s3://testbucket
|
||||||
|
```
|
||||||
|
|
||||||
|
Or even mount it with [GeeseFS](https://github.com/yandex-cloud/geesefs):
|
||||||
|
|
||||||
|
```
|
||||||
|
AWS_ACCESS_KEY_ID=accessKey1 \
|
||||||
|
AWS_SECRET_ACCESS_KEY=verySecretKey1 \
|
||||||
|
geesefs --endpoint http://localhost:8000 testbucket mountdir
|
||||||
|
```
|
||||||
|
|
||||||
|
## Author & License
|
||||||
|
|
||||||
|
- [Zenko CloudServer](https://s3-server.readthedocs.io/en/latest/) author is Scality,
|
||||||
|
licensed under [Apache License, version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
- [Vitastor](https://git.yourcmc.ru/vitalif/vitastor/) and Zenko Vitastor backend author is
|
||||||
|
Vitaliy Filippov, licensed under [VNPL-1.1](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/VNPL-1.1.txt)
|
||||||
|
(a "network copyleft" license based on AGPL/SSPL, but worded in a better way)
|
||||||
|
- Vitastor S3 repository: https://git.yourcmc.ru/vitalif/zenko-cloudserver-vitastor
|
||||||
|
- Vitastor S3 backend code: https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts
|
|
@ -0,0 +1,171 @@
|
||||||
|
[Документация](../../README-ru.md#документация) → Установка → S3 на базе Vitastor
|
||||||
|
|
||||||
|
-----
|
||||||
|
|
||||||
|
[Read in English](s3.en.md)
|
||||||
|
|
||||||
|
# S3 на базе Vitastor
|
||||||
|
|
||||||
|
Итак, свершилось - реализация Vitastor S3 на базе Zenko CloudServer достигла
|
||||||
|
состояния готовности к публикации и использованию.
|
||||||
|
|
||||||
|
## Ключевые особенности
|
||||||
|
|
||||||
|
- Zenko CloudServer реализован на node.js.
|
||||||
|
- Метаданные объектов хранятся в MongoDB.
|
||||||
|
- Поставляется модифицированная версия Zenko CloudServer, отвязанная от лишних зависимостей,
|
||||||
|
с оптимизированной сборкой и немного отличающаяся от оригинала.
|
||||||
|
- Данные объектов хранятся в блочных томах Vitastor, однако информация о самих томах
|
||||||
|
сохраняется не в etcd Vitastor, а тоже в БД на основе MongoDB.
|
||||||
|
- Объекты записываются в тома последовательно друг за другом. Место выделяется с округлением
|
||||||
|
до размера сектора (до 4 килобайт), поэтому каждый объект занимает как минимум 4 КБ.
|
||||||
|
- Благодаря такой схеме записи объектов мелкие объекты не нарезаются на части и поэтому не
|
||||||
|
требуют чтения с N дисков данных в EC N+K пулах Vitastor.
|
||||||
|
- При удалении объекты помечаются удалёнными, но место освобождается не сразу, а при
|
||||||
|
запускаемой асинхронно "дефрагментации". Дефрагментация запускается автоматически в фоне
|
||||||
|
при достижении заданного объёма "мусора" в томе (по умолчанию 20%), копирует актуальные
|
||||||
|
объекты в новые тома, после чего очищает старый том полностью. Дефрагментацию можно
|
||||||
|
настраивать в locationConfig.json.
|
||||||
|
|
||||||
|
## Планы развития
|
||||||
|
|
||||||
|
- Хранение учётных записей в БД, а не в статическом файле (в оригинальном Zenko для
|
||||||
|
этого используется отдельный закрытый сервис "Scality Vault").
|
||||||
|
- Более подробная документация.
|
||||||
|
- Поддержка других (и более производительных) key-value СУБД для хранения метаданных.
|
||||||
|
- Другие оптимизации производительности, например, в области используемой хеш-функции
|
||||||
|
(хеш MD5, используемый в целях совместимости, относительно медленный).
|
||||||
|
- Поддержка Object Lifecycle. Реализация Lifecycle для Zenko существует и называется
|
||||||
|
[Backbeat](https://github.com/scality/backbeat), но она ещё не адаптирована для Vitastor.
|
||||||
|
- Квоты. В оригинальном Zenko для этого используется отдельный сервис "SCUBA", однако
|
||||||
|
он тоже является закрытым и недоступен для публичного использования.
|
||||||
|
|
||||||
|
## Установка
|
||||||
|
|
||||||
|
Кратко:
|
||||||
|
|
||||||
|
- Установите MongoDB, создайте пользователя для БД метаданных S3.
|
||||||
|
- Создайте в Vitastor пул для хранения данных объектов.
|
||||||
|
- Скачайте и настройте Docker-контейнер `vitalif/vitastor-zenko`.
|
||||||
|
|
||||||
|
### Установка MongoDB
|
||||||
|
|
||||||
|
Вы можете установить MongoDB сами, следуя [официальному руководству MongoDB](https://www.mongodb.com/docs/manual/installation/).
|
||||||
|
|
||||||
|
Либо вы можете последовать инструкции, приведённой ниже - здесь описан простейший пример
|
||||||
|
установки MongoDB в Docker (docker-compose) в конфигурации с 3 репликами.
|
||||||
|
|
||||||
|
1. На всех 3 серверах создайте файл `docker-compose.yml`, заменив `<ВАШ_ПАРОЛЬ>`
|
||||||
|
на собственный будущий пароль администратора mongodb, а `0.0.0.0` по желанию
|
||||||
|
заменив на `localhost,<IP_сервера>` - желательно либо использовать публично не доступный IP,
|
||||||
|
либо потом [настроить TLS](https://www.mongodb.com/docs/manual/tutorial/configure-ssl/).
|
||||||
|
|
||||||
|
```
|
||||||
|
version: '3.1'
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
mongo:
|
||||||
|
container_name: mongo
|
||||||
|
image: mongo:7-jammy
|
||||||
|
restart: always
|
||||||
|
environment:
|
||||||
|
MONGO_INITDB_ROOT_USERNAME: root
|
||||||
|
MONGO_INITDB_ROOT_PASSWORD: <ВАШ_ПАРОЛЬ>
|
||||||
|
network_mode: host
|
||||||
|
volumes:
|
||||||
|
- ./keyfile:/opt/keyfile
|
||||||
|
- ./mongo-data/db:/data/db
|
||||||
|
- ./mongo-data/configdb:/data/configdb
|
||||||
|
entrypoint: /bin/bash -c
|
||||||
|
command: [ "chown mongodb /opt/keyfile && chmod 600 /opt/keyfile && . /usr/local/bin/docker-entrypoint.sh mongod --replSet rs0 --keyFile /opt/keyfile --bind_ip 0.0.0.0" ]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. В той же директории сгенерируйте общий ключ кластера командой `openssl rand -base64 756 > ./keyfile`
|
||||||
|
и скопируйте этот файл на все 3 сервера.
|
||||||
|
|
||||||
|
3. На всех 3 серверах запустите MongoDB командой `docker compose up -d mongo`.
|
||||||
|
|
||||||
|
4. Зайдите в Mongo Shell с помощью команды `docker exec -it mongo mongosh -u root -p <ВАШ_ПАРОЛЬ> localhost/admin`
|
||||||
|
и там выполните команду (заменив IP-адреса `10.10.10.{1,2,3}` на адреса своих серверов):
|
||||||
|
|
||||||
|
`rs.initiate({ _id: 'rs0', members: [
|
||||||
|
{ _id: 1, host: '10.10.10.1:27017' },
|
||||||
|
{ _id: 2, host: '10.10.10.2:27017' },
|
||||||
|
{ _id: 3, host: '10.10.10.3:27017' }
|
||||||
|
] })`
|
||||||
|
|
||||||
|
5. Находясь там же, в Mongo Shell, создайте пользователя с доступом к будущей базе данных S3:
|
||||||
|
|
||||||
|
`db.createUser({ user: 's3', pwd: '<ВАШ_ПАРОЛЬ_S3>', roles: [
|
||||||
|
{ role: 'readWrite', db: 's3' },
|
||||||
|
{ role: 'dbAdmin', db: 's3' },
|
||||||
|
{ role: 'readWrite', db: 'vitastor' },
|
||||||
|
{ role: 'dbAdmin', db: 'vitastor' }
|
||||||
|
] })`
|
||||||
|
|
||||||
|
### Настройка Vitastor
|
||||||
|
|
||||||
|
Создайте в Vitastor отдельный пул для данных объектов S3, например:
|
||||||
|
|
||||||
|
`vitastor-cli create-pool --ec 2+1 -n 512 s3-data --used_for_app s3:standard`
|
||||||
|
|
||||||
|
Опция `--used_for_app` работает как "защита от дурака" и не даёт вам случайно создать
|
||||||
|
в этом пуле обычный блочный том и перезаписать им какие-то данные S3, а также скрывает
|
||||||
|
статистику занятого места по томам S3 из etcd.
|
||||||
|
|
||||||
|
Получите ID своего пула с помощью команды `vitastor-cli ls-pools --detail`.
|
||||||
|
|
||||||
|
### Установка Vitastor S3
|
||||||
|
|
||||||
|
1. Добавьте в `docker-compose.yml` строки (альтернативно вместо `network_mode: host`
|
||||||
|
можно использовать `ports: [ "8000:8000", "8002:8002" ]`):
|
||||||
|
|
||||||
|
```
|
||||||
|
zenko:
|
||||||
|
container_name: zenko
|
||||||
|
image: vitalif/vitastor-zenko
|
||||||
|
restart: always
|
||||||
|
security_opt:
|
||||||
|
- seccomp:unconfined
|
||||||
|
ulimits:
|
||||||
|
memlock: -1
|
||||||
|
network_mode: host
|
||||||
|
volumes:
|
||||||
|
- /etc/vitastor:/etc/vitastor
|
||||||
|
- /etc/vitastor/s3:/conf
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Извлеките из Docker-образа Vitastor примеры файлов конфигурации:
|
||||||
|
`docker run --rm -it -v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf vitalif/vitastor-zenko configure.sh`
|
||||||
|
|
||||||
|
3. Отредактируйте файлы конфигурации в `/etc/vitastor/s3/`:
|
||||||
|
- `config.json` - общие настройки.
|
||||||
|
- `authdata.json` - учётные записи и ключи доступа.
|
||||||
|
- `locationConfig.json` - список классов хранения S3 с настройками расположения.
|
||||||
|
Внимание: в данной версии это именно список S3 storage class-ов (STANDARD, COLD и т.п.),
|
||||||
|
а не зон (подобных us-east-1), как в оригинальном Zenko CloudServer.
|
||||||
|
- В `config.json` и в `locationConfig.json` пропишите свои данные подключения к MongoDB.
|
||||||
|
- В `locationConfig.json` укажите ID пула Vitastor для хранения данных.
|
||||||
|
- Полный перечень настроек Vitastor-бэкенда пока можно посмотреть [в коде](https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts#L94).
|
||||||
|
|
||||||
|
### Запуск
|
||||||
|
|
||||||
|
Запустите S3-сервер: `docker-compose up -d zenko`
|
||||||
|
|
||||||
|
Готово! Вы получили S3-сервер, работающий на порту 8000.
|
||||||
|
|
||||||
|
Можете попробовать обратиться к нему с помощью, например, [s3cmd](https://s3tools.org/s3cmd):
|
||||||
|
|
||||||
|
`s3cmd --host-bucket= --no-ssl --access_key=accessKey1 --secret_key=verySecretKey1 --host=http://localhost:8000 mb s3://testbucket`
|
||||||
|
|
||||||
|
Или смонтировать его с помощью [GeeseFS](https://github.com/yandex-cloud/geesefs):
|
||||||
|
|
||||||
|
`AWS_ACCESS_KEY_ID=accessKey1 AWS_SECRET_ACCESS_KEY=verySecretKey1 geesefs --endpoint http://localhost:8000 testbucket /mnt/geesefs`
|
||||||
|
|
||||||
|
## Лицензия
|
||||||
|
|
||||||
|
- Автор [Zenko CloudServer](https://s3-server.readthedocs.io/en/latest/) - Scality, лицензия [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
- Vitastor-бэкенд для S3, как и сам Vitastor, лицензируется на условиях [VNPL 1.1](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/VNPL-1.1.txt)
|
||||||
|
- Репозиторий сборки: https://git.yourcmc.ru/vitalif/zenko-cloudserver-vitastor
|
||||||
|
- Бэкенд хранения данных: https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts
|
|
@ -16,7 +16,7 @@
|
||||||
designated initializers support from C++20
|
designated initializers support from C++20
|
||||||
- CMake
|
- CMake
|
||||||
- liburing, jerasure headers and libraries
|
- liburing, jerasure headers and libraries
|
||||||
- ISA-L, libibverbs headers and libraries (optional)
|
- ISA-L, libibverbs and librdmacm headers and libraries (optional)
|
||||||
- tcmalloc (google-perftools-dev)
|
- tcmalloc (google-perftools-dev)
|
||||||
|
|
||||||
## Basic instructions
|
## Basic instructions
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
назначенных инициализаторов (designated initializers) из C++20
|
назначенных инициализаторов (designated initializers) из C++20
|
||||||
- CMake
|
- CMake
|
||||||
- Заголовки и библиотеки liburing, jerasure
|
- Заголовки и библиотеки liburing, jerasure
|
||||||
- Опционально - заголовки и библиотеки ISA-L, libibverbs
|
- Опционально - заголовки и библиотеки ISA-L, libibverbs, librdmacm
|
||||||
- tcmalloc (google-perftools-dev)
|
- tcmalloc (google-perftools-dev)
|
||||||
|
|
||||||
## Базовая инструкция
|
## Базовая инструкция
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
- Per-OSD and per-image I/O and space usage statistics in etcd
|
- Per-OSD and per-image I/O and space usage statistics in etcd
|
||||||
- Snapshots and copy-on-write image clones
|
- Snapshots and copy-on-write image clones
|
||||||
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
||||||
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
|
- RDMA/RoCEv2 support [via libibverbs](../config/network.en.md#use_rdma) or [RDMA-CM](../config/network.en.md#use_rdmacm)
|
||||||
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
|
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
|
||||||
- [Checksums](../config/layout-osd.en.md#data_csum_type)
|
- [Checksums](../config/layout-osd.en.md#data_csum_type)
|
||||||
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
|
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
|
||||||
|
@ -37,6 +37,7 @@
|
||||||
- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
|
- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
|
||||||
- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)
|
- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)
|
||||||
- [NFS RDMA support](../usage/nfs.en.md#rdma) (probably also usable for GPUDirect)
|
- [NFS RDMA support](../usage/nfs.en.md#rdma) (probably also usable for GPUDirect)
|
||||||
|
- [S3](../installation/s3.en.md)
|
||||||
|
|
||||||
## Plugins and tools
|
## Plugins and tools
|
||||||
|
|
||||||
|
@ -63,7 +64,6 @@ The following features are planned for the future:
|
||||||
- iSCSI and NVMeoF gateways
|
- iSCSI and NVMeoF gateways
|
||||||
- Multi-threaded client
|
- Multi-threaded client
|
||||||
- Faster failover
|
- Faster failover
|
||||||
- S3
|
|
||||||
- Tiered storage (SSD caching)
|
- Tiered storage (SSD caching)
|
||||||
- NVDIMM support
|
- NVDIMM support
|
||||||
- Compression (possibly)
|
- Compression (possibly)
|
||||||
|
|
|
@ -30,7 +30,7 @@
|
||||||
- Именование инодов через хранение их метаданных в etcd
|
- Именование инодов через хранение их метаданных в etcd
|
||||||
- Снапшоты и copy-on-write клоны
|
- Снапшоты и copy-on-write клоны
|
||||||
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
|
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
|
||||||
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
|
- Поддержка RDMA/RoCEv2 [через libibverbs](../config/network.ru.md#use_rdma) или [RDMA-CM](../config/network.ru.md#use_rdmacm)
|
||||||
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
|
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
|
||||||
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
|
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
|
||||||
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
|
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
|
||||||
|
@ -39,6 +39,7 @@
|
||||||
- [Экспериментальная встроенная замена etcd - antietcd](../config/monitor.ru.md#use_antietcd)
|
- [Экспериментальная встроенная замена etcd - antietcd](../config/monitor.ru.md#use_antietcd)
|
||||||
- [Встроенный Prometheus-экспортер метрик](../config/monitor.ru.md#enable_prometheus)
|
- [Встроенный Prometheus-экспортер метрик](../config/monitor.ru.md#enable_prometheus)
|
||||||
- [Поддержка NFS RDMA](../usage/nfs.ru.md#rdma) (вероятно, также подходящая для GPUDirect)
|
- [Поддержка NFS RDMA](../usage/nfs.ru.md#rdma) (вероятно, также подходящая для GPUDirect)
|
||||||
|
- [S3](../installation/s3.ru.md)
|
||||||
|
|
||||||
## Драйверы и инструменты
|
## Драйверы и инструменты
|
||||||
|
|
||||||
|
@ -63,7 +64,6 @@
|
||||||
- iSCSI и NVMeoF прокси
|
- iSCSI и NVMeoF прокси
|
||||||
- Многопоточный клиент
|
- Многопоточный клиент
|
||||||
- Более быстрое переключение при отказах
|
- Более быстрое переключение при отказах
|
||||||
- S3
|
|
||||||
- Поддержка SSD-кэширования (tiered storage)
|
- Поддержка SSD-кэширования (tiered storage)
|
||||||
- Поддержка NVDIMM
|
- Поддержка NVDIMM
|
||||||
- Возможно, сжатие
|
- Возможно, сжатие
|
||||||
|
|
|
@ -50,7 +50,7 @@ On the monitor hosts:
|
||||||
|
|
||||||
## Configure OSDs
|
## Configure OSDs
|
||||||
|
|
||||||
- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
|
- Put etcd_address and [osd_network](../config/network.en.md#osd_network) into `/etc/vitastor/vitastor.conf`. Example:
|
||||||
```
|
```
|
||||||
{
|
{
|
||||||
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
|
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
|
|
||||||
## Настройте OSD
|
## Настройте OSD
|
||||||
|
|
||||||
- Пропишите etcd_address и osd_network в `/etc/vitastor/vitastor.conf`. Например:
|
- Пропишите etcd_address и [osd_network](../config/network.ru.md#osd_network) в `/etc/vitastor/vitastor.conf`. Например:
|
||||||
```
|
```
|
||||||
{
|
{
|
||||||
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
|
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
|
||||||
|
|
|
@ -35,10 +35,19 @@ PG state consists of exactly 1 base state and an arbitrary number of additional
|
||||||
|
|
||||||
PG state always includes exactly 1 of the following base states:
|
PG state always includes exactly 1 of the following base states:
|
||||||
- **active** — PG is active and handles user I/O.
|
- **active** — PG is active and handles user I/O.
|
||||||
- **incomplete** — Not enough OSDs are available to activate this PG. That is, more disks
|
- **incomplete** — Not enough OSDs are available to activate this PG. More precisely, that
|
||||||
are lost than it's allowed by the pool's redundancy scheme. For example, if the pool has
|
means one of the following:
|
||||||
pg_size=3 and pg_minsize=1, part of the data may be written only to 1 OSD. If that exact
|
- Less than pg_minsize current target OSDs are available for the PG. I.e. more disks
|
||||||
OSD is lost, PG will become **incomplete**.
|
are lost than allowed by the pool's redundancy scheme.
|
||||||
|
- All OSDs of some of PG's history records are unavailable, or, for EC pools, less
|
||||||
|
than (pg_size-parity_chunks) OSDs are available in one of the history records.
|
||||||
|
In other words, it means that some data in this PG was written to an OSD set such that
|
||||||
|
it's currently impossible to read it back because these OSDs are down. For example,
|
||||||
|
if the pool has pg_size=3 and pg_minsize=1, part of the data may be written only to
|
||||||
|
1 OSD. If that exact OSD is lost, PG becomes **incomplete**.
|
||||||
|
- [allow_net_split](../config/osd.en.md#allow_net_split) is disabled (default) and
|
||||||
|
primary OSD of the PG can't connect to some secondary OSDs marked as alive in etcd.
|
||||||
|
I.e. a network partition happened: OSDs can talk to etcd, but not to some other OSDs.
|
||||||
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
|
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
|
||||||
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
|
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
|
||||||
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
|
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
|
||||||
|
|
|
@ -35,10 +35,20 @@
|
||||||
|
|
||||||
Состояние PG включает в себя ровно 1 флаг из следующих:
|
Состояние PG включает в себя ровно 1 флаг из следующих:
|
||||||
- **active** — PG активна и обрабатывает запросы ввода-вывода от пользователей.
|
- **active** — PG активна и обрабатывает запросы ввода-вывода от пользователей.
|
||||||
- **incomplete** — Недостаточно живых OSD, чтобы включить эту PG.
|
- **incomplete** — Недостаточно живых OSD, чтобы включить эту PG. Если точнее, то это
|
||||||
То есть, дисков потеряно больше, чем разрешено схемой отказоустойчивости пула и pg_minsize.
|
означает один из следующих вариантов:
|
||||||
Например, если у пула pg_size=3 и pg_minsize=1, то часть данных может записаться всего на 1 OSD.
|
- Доступно менее, чем pg_minsize текущих целевых OSD данной PG. Иными словами, потеряно
|
||||||
Если потом конкретно этот OSD упадёт, PG окажется **incomplete**.
|
больше дисков, чем это разрешает схема отказоустойчивости пула.
|
||||||
|
- Все OSD одной из исторических записей PG недоступны, или, для EC-пулов, в одной
|
||||||
|
из исторических записей PG доступно менее, чем (pg_size-parity_chunks) OSD. Другими
|
||||||
|
словами, это означает, что часть данных этой PG была записана в такой набор OSD, из
|
||||||
|
которого их сейчас невозможно прочитать обратно, так как OSD не включены. Например,
|
||||||
|
если у пула pg_size=3 и pg_minsize=1, то часть данных может записаться всего на 1 OSD.
|
||||||
|
Если потом конкретно этот OSD упадёт, PG окажется **incomplete**.
|
||||||
|
- [allow_net_split](../config/osd.ru.md#allow_net_split) отключено (по умолчанию) и
|
||||||
|
первичный OSD данной PG не может соединиться с частью вторичных OSD этой PG, помеченных
|
||||||
|
как живых в etcd. Это означает, что произошло разделение сети: OSD могут общаться с etcd,
|
||||||
|
но не могут общаться с частью других OSD.
|
||||||
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
|
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
|
||||||
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
|
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
|
||||||
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
|
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
|
||||||
|
|
|
@ -355,7 +355,7 @@ Set OSD reweight, tags or noout flag. See detail description in [OSD config docu
|
||||||
|
|
||||||
## pg-list
|
## pg-list
|
||||||
|
|
||||||
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
|
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs|pgs [OPTIONS] [state1+state2] [^state3] [...]`
|
||||||
|
|
||||||
List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:
|
List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:
|
||||||
|
|
||||||
|
@ -363,6 +363,7 @@ List PGs with any of listed state filters (^ or ! in the beginning is negation).
|
||||||
--pool <pool name or number> Only list PGs of the given pool.
|
--pool <pool name or number> Only list PGs of the given pool.
|
||||||
--min <min pg number> Only list PGs with number >= min.
|
--min <min pg number> Only list PGs with number >= min.
|
||||||
--max <max pg number> Only list PGs with number <= max.
|
--max <max pg number> Only list PGs with number <= max.
|
||||||
|
--osd 1,2,... Only list PGs with some data on specified OSD(s).
|
||||||
```
|
```
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
@ -377,11 +378,11 @@ Examples:
|
||||||
|
|
||||||
Create a pool. Required parameters:
|
Create a pool. Required parameters:
|
||||||
|
|
||||||
| <!-- --> | <!-- --> |
|
| <!-- --> | <!-- --> |
|
||||||
|--------------------------|---------------------------------------------------------------------------------------|
|
|--------------------------|-----------------------------------------------------------------------------------------|
|
||||||
| `-s R` or `--pg_size R` | Number of replicas for replicated pools |
|
| `-s R` or `--pg_size R` | Number of replicas for replicated pools |
|
||||||
| `--ec N+K` | Number of data (N) and parity (K) chunks for erasure-coded pools |
|
| `--ec N+K` | Number of data (N) and parity (K) chunks for erasure-coded pools |
|
||||||
| `-n N` or `--pg_count N` | PG count for the new pool (start with 10*<OSD count>/pg_size rounded to a power of 2) |
|
| `-n N` or `--pg_count N` | PG count for the new pool (start with 10*\<OSD count\>/pg_size rounded to a power of 2) |
|
||||||
|
|
||||||
Optional parameters:
|
Optional parameters:
|
||||||
|
|
||||||
|
@ -398,7 +399,8 @@ Optional parameters:
|
||||||
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
||||||
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
||||||
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
||||||
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
|
| `--used_for_app fs:<name>` | Mark pool as used for VitastorFS with metadata in image `<name>` |
|
||||||
|
| `--used_for_app s3:<name>` | Mark pool as used for S3 location with name `<name>` |
|
||||||
| `--pg_stripe_size <number>` | Increase object grouping stripe |
|
| `--pg_stripe_size <number>` | Increase object grouping stripe |
|
||||||
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
|
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
|
||||||
| `--wait` | Wait for the new pool to come online |
|
| `--wait` | Wait for the new pool to come online |
|
||||||
|
|
|
@ -22,6 +22,8 @@ vitastor-cli - интерфейс командной строки для адм
|
||||||
- [flatten](#flatten)
|
- [flatten](#flatten)
|
||||||
- [rm-data](#rm-data)
|
- [rm-data](#rm-data)
|
||||||
- [merge-data](#merge-data)
|
- [merge-data](#merge-data)
|
||||||
|
- [describe](#describe)
|
||||||
|
- [fix](#fix)
|
||||||
- [alloc-osd](#alloc-osd)
|
- [alloc-osd](#alloc-osd)
|
||||||
- [rm-osd](#rm-osd)
|
- [rm-osd](#rm-osd)
|
||||||
- [osd-tree](#osd-tree)
|
- [osd-tree](#osd-tree)
|
||||||
|
@ -375,9 +377,10 @@ OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP
|
||||||
в начале фильтра означает отрицание). Опции:
|
в начале фильтра означает отрицание). Опции:
|
||||||
|
|
||||||
```
|
```
|
||||||
--pool <pool name or number> Only list PGs of the given pool.
|
--pool <pool name or number> Вывести только PG в заданном пуле.
|
||||||
--min <min pg number> Only list PGs with number >= min.
|
--min <min pg number> Вывести только PG с номерами >= min.
|
||||||
--max <max pg number> Only list PGs with number <= max.
|
--max <max pg number> Вывести только PG с номерами <= max.
|
||||||
|
--osd 1,2,... Вывести только PG с данными на заданных OSD.
|
||||||
```
|
```
|
||||||
|
|
||||||
Примеры:
|
Примеры:
|
||||||
|
@ -392,11 +395,11 @@ OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP
|
||||||
|
|
||||||
Создать пул. Обязательные параметры:
|
Создать пул. Обязательные параметры:
|
||||||
|
|
||||||
| <!-- --> | <!-- --> |
|
| <!-- --> | <!-- --> |
|
||||||
|---------------------------|---------------------------------------------------------------------------------------------|
|
|---------------------------|-----------------------------------------------------------------------------------------------|
|
||||||
| `-s R` или `--pg_size R` | Число копий данных для реплицированных пулов |
|
| `-s R` или `--pg_size R` | Число копий данных для реплицированных пулов |
|
||||||
| `--ec N+K` | Число частей данных (N) и чётности (K) для пулов с кодами коррекции ошибок |
|
| `--ec N+K` | Число частей данных (N) и чётности (K) для пулов с кодами коррекции ошибок |
|
||||||
| `-n N` или `--pg_count N` | Число PG для нового пула (начните с 10*<число OSD>/pg_size, округлённого до степени двойки) |
|
| `-n N` или `--pg_count N` | Число PG для нового пула (начните с 10*\<число OSD\>/pg_size, округлённого до степени двойки) |
|
||||||
|
|
||||||
Необязательные параметры:
|
Необязательные параметры:
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ It supports the following commands:
|
||||||
- [upgrade-simple](#upgrade-simple)
|
- [upgrade-simple](#upgrade-simple)
|
||||||
- [resize](#resize)
|
- [resize](#resize)
|
||||||
- [raw-resize](#raw-resize)
|
- [raw-resize](#raw-resize)
|
||||||
|
- [trim](#trim)
|
||||||
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
|
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
|
||||||
- [purge](#purge)
|
- [purge](#purge)
|
||||||
- [read-sb](#read-sb)
|
- [read-sb](#read-sb)
|
||||||
|
@ -97,6 +98,9 @@ Options (both modes):
|
||||||
--data_device_block 4k Override data device block size
|
--data_device_block 4k Override data device block size
|
||||||
--meta_device_block 4k Override metadata device block size
|
--meta_device_block 4k Override metadata device block size
|
||||||
--journal_device_block 4k Override journal device block size
|
--journal_device_block 4k Override journal device block size
|
||||||
|
--discard_on_start 0 TRIM unused data device blocks every OSD start (default off)
|
||||||
|
--min_discard_size 1M Minimum TRIM block size
|
||||||
|
--json Enable JSON output
|
||||||
```
|
```
|
||||||
|
|
||||||
[immediate_commit](../config/layout-cluster.en.md#immediate_commit) setting is
|
[immediate_commit](../config/layout-cluster.en.md#immediate_commit) setting is
|
||||||
|
@ -179,6 +183,19 @@ parameters from OSD command line (i.e. from systemd unit or superblock).
|
||||||
SIZE may include k/m/g/t suffixes. If any of the new layout parameter
|
SIZE may include k/m/g/t suffixes. If any of the new layout parameter
|
||||||
options are not specified, old values will be used.
|
options are not specified, old values will be used.
|
||||||
|
|
||||||
|
## trim
|
||||||
|
|
||||||
|
`vitastor-disk trim <osd_num>|<osd_device> [<osd_num>|<osd_device>...]`
|
||||||
|
|
||||||
|
Try to discard unused blocks (SSD TRIM) on the data device of each of the OSD(s).
|
||||||
|
|
||||||
|
May only be used on stopped OSDs. Options:
|
||||||
|
|
||||||
|
```
|
||||||
|
--min_discard_size 1M Minimum TRIM block size
|
||||||
|
--discard_granularity 0 Override device's discard granularity
|
||||||
|
```
|
||||||
|
|
||||||
## start/stop/restart/enable/disable
|
## start/stop/restart/enable/disable
|
||||||
|
|
||||||
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`
|
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`
|
||||||
|
|
|
@ -99,6 +99,9 @@ vitastor-disk - инструмент командной строки для уп
|
||||||
--data_device_block 4k Задать размер блока устройства данных
|
--data_device_block 4k Задать размер блока устройства данных
|
||||||
--meta_device_block 4k Задать размер блока метаданных
|
--meta_device_block 4k Задать размер блока метаданных
|
||||||
--journal_device_block 4k Задать размер блока журнала
|
--journal_device_block 4k Задать размер блока журнала
|
||||||
|
--discard_on_start 0 Выполнять TRIM пустых блоков данных при запуске OSD (по умолчанию нет)
|
||||||
|
--min_discard_size 1M Минимальный размер блока для TRIM
|
||||||
|
--json Включить JSON-вывод
|
||||||
```
|
```
|
||||||
|
|
||||||
Настройка [immediate_commit](../config/layout-cluster.ru.md#immediate_commit)
|
Настройка [immediate_commit](../config/layout-cluster.ru.md#immediate_commit)
|
||||||
|
@ -182,6 +185,20 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
|
||||||
`РАЗМЕР` может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
|
`РАЗМЕР` может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
|
||||||
расположения не указан, он принимается равным старому значению.
|
расположения не указан, он принимается равным старому значению.
|
||||||
|
|
||||||
|
## trim
|
||||||
|
|
||||||
|
`vitastor-disk trim <osd_num>|<osd_device> [<osd_num>|<osd_device>...]`
|
||||||
|
|
||||||
|
Попробовать пометить пустые блоки дисков данных всех указанных OSD неиспользуемыми
|
||||||
|
(выполнить команду SSD TRIM).
|
||||||
|
|
||||||
|
Можно использовать только с остановленными OSD. Опции:
|
||||||
|
|
||||||
|
```
|
||||||
|
--min_discard_size 1M Минимальный размер блока для TRIM
|
||||||
|
--discard_granularity 0 Кратность размера блока для TRIM
|
||||||
|
```
|
||||||
|
|
||||||
## start/stop/restart/enable/disable
|
## start/stop/restart/enable/disable
|
||||||
|
|
||||||
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`
|
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`
|
||||||
|
|
|
@ -58,7 +58,7 @@ To use VitastorFS:
|
||||||
2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
|
2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
|
||||||
but you can create it in the data pool too if you want (image size doesn't matter):
|
but you can create it in the data pool too if you want (image size doesn't matter):
|
||||||
`vitastor-cli create -s 10G -p fastpool testfs`
|
`vitastor-cli create -s 10G -p fastpool testfs`
|
||||||
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
|
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
|
||||||
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
||||||
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
|
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,7 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||||
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
|
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
|
||||||
(размер образа значения не имеет):
|
(размер образа значения не имеет):
|
||||||
`vitastor-cli create -s 10G -p fastpool testfs`
|
`vitastor-cli create -s 10G -p fastpool testfs`
|
||||||
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
|
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
|
||||||
4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
||||||
5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
|
5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
|
||||||
const stat = state.osd.stats[osd_num];
|
const stat = state.osd.stats[osd_num];
|
||||||
const osd_cfg = state.config.osd[osd_num];
|
const osd_cfg = state.config.osd[osd_num];
|
||||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||||
if (reweight < 0 || isNaN(reweight))
|
if (isNaN(reweight) || reweight < 0 || reweight > 1)
|
||||||
reweight = 1;
|
reweight = 1;
|
||||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||||
osd_cfg && osd_cfg.noout))
|
osd_cfg && osd_cfg.noout))
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "vitastor-mon",
|
"name": "vitastor-mon",
|
||||||
"version": "1.11.0",
|
"version": "2.1.0",
|
||||||
"description": "Vitastor SDS monitor service",
|
"description": "Vitastor SDS monitor service",
|
||||||
"main": "mon-main.js",
|
"main": "mon-main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
@ -19,6 +19,6 @@
|
||||||
"eslint-plugin-node": "^11.1.0"
|
"eslint-plugin-node": "^11.1.0"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=12.0.0"
|
"node": ">=12.1.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,11 +19,16 @@
|
||||||
class NodeVitastorRequest: public Nan::AsyncResource
|
class NodeVitastorRequest: public Nan::AsyncResource
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
NodeVitastorRequest(NodeVitastor *cli, v8::Local<v8::Function> cb): Nan::AsyncResource("NodeVitastorRequest")
|
NodeVitastorRequest(NodeVitastor *cli, const v8::Local<v8::Function> & cb): Nan::AsyncResource("NodeVitastorRequest")
|
||||||
{
|
{
|
||||||
this->cli = cli;
|
this->cli = cli;
|
||||||
callback.Reset(cb);
|
callback.Reset(cb);
|
||||||
}
|
}
|
||||||
|
~NodeVitastorRequest()
|
||||||
|
{
|
||||||
|
callback.Reset();
|
||||||
|
buffer_ref.Reset();
|
||||||
|
}
|
||||||
|
|
||||||
iovec iov;
|
iovec iov;
|
||||||
std::vector<iovec> iov_list;
|
std::vector<iovec> iov_list;
|
||||||
|
@ -33,6 +38,7 @@ public:
|
||||||
uint64_t offset = 0, len = 0, version = 0;
|
uint64_t offset = 0, len = 0, version = 0;
|
||||||
bool with_parents = false;
|
bool with_parents = false;
|
||||||
Nan::Persistent<v8::Function> callback;
|
Nan::Persistent<v8::Function> callback;
|
||||||
|
Nan::Persistent<v8::Value> buffer_ref;
|
||||||
};
|
};
|
||||||
|
|
||||||
static uint64_t get_ui64(const v8::Local<v8::Value> & val)
|
static uint64_t get_ui64(const v8::Local<v8::Value> & val)
|
||||||
|
@ -83,7 +89,6 @@ NAN_METHOD(NodeVitastor::Create)
|
||||||
delete[] c_cfg;
|
delete[] c_cfg;
|
||||||
if (!cli->c)
|
if (!cli->c)
|
||||||
{
|
{
|
||||||
ERRORF("NodeVitastor: failed to initialize io_uring (old kernel or insufficient ulimit -l?)");
|
|
||||||
Nan::ThrowError("failed to initialize io_uring (old kernel or insufficient ulimit -l?)");
|
Nan::ThrowError("failed to initialize io_uring (old kernel or insufficient ulimit -l?)");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -130,8 +135,8 @@ NodeVitastorRequest* NodeVitastor::get_read_request(const Nan::FunctionCallbackI
|
||||||
Nan::ThrowError("failed to allocate memory");
|
Nan::ThrowError("failed to allocate memory");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
|
|
||||||
auto req = new NodeVitastorRequest(this, callback);
|
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
|
||||||
|
|
||||||
req->offset = offset;
|
req->offset = offset;
|
||||||
req->len = len;
|
req->len = len;
|
||||||
|
@ -156,6 +161,9 @@ NAN_METHOD(NodeVitastor::Read)
|
||||||
|
|
||||||
self->Ref();
|
self->Ref();
|
||||||
vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, on_read_finish, req);
|
vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, on_read_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
|
NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
|
||||||
|
@ -175,11 +183,11 @@ NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallback
|
||||||
argpos++;
|
argpos++;
|
||||||
}
|
}
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
|
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(this, callback);
|
|
||||||
|
|
||||||
req->offset = offset;
|
req->offset = offset;
|
||||||
req->version = version;
|
req->version = version;
|
||||||
|
req->buffer_ref.Reset(bufarg);
|
||||||
|
|
||||||
if (bufarg->IsArray())
|
if (bufarg->IsArray())
|
||||||
{
|
{
|
||||||
|
@ -224,6 +232,9 @@ NAN_METHOD(NodeVitastor::Write)
|
||||||
req->iov_list.size() ? req->iov_list.data() : &req->iov,
|
req->iov_list.size() ? req->iov_list.data() : &req->iov,
|
||||||
req->iov_list.size() ? req->iov_list.size() : 1,
|
req->iov_list.size() ? req->iov_list.size() : 1,
|
||||||
on_write_finish, req);
|
on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
|
NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
|
||||||
|
@ -243,8 +254,7 @@ NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbac
|
||||||
argpos++;
|
argpos++;
|
||||||
}
|
}
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
|
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(this, callback);
|
|
||||||
|
|
||||||
req->offset = offset;
|
req->offset = offset;
|
||||||
req->len = len;
|
req->len = len;
|
||||||
|
@ -270,6 +280,9 @@ NAN_METHOD(NodeVitastor::Delete)
|
||||||
self->Ref();
|
self->Ref();
|
||||||
vitastor_c_delete(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
|
vitastor_c_delete(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
|
||||||
on_write_finish, req);
|
on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// sync(callback(err))
|
// sync(callback(err))
|
||||||
|
@ -281,11 +294,13 @@ NAN_METHOD(NodeVitastor::Sync)
|
||||||
|
|
||||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
auto req = new NodeVitastorRequest(self, info[0].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(self, callback);
|
|
||||||
|
|
||||||
self->Ref();
|
self->Ref();
|
||||||
vitastor_c_sync(self->c, on_write_finish, req);
|
vitastor_c_sync(self->c, on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// read_bitmap(pool, inode, offset, length, with_parents, callback(err, bitmap_buffer))
|
// read_bitmap(pool, inode, offset, length, with_parents, callback(err, bitmap_buffer))
|
||||||
|
@ -302,11 +317,13 @@ NAN_METHOD(NodeVitastor::ReadBitmap)
|
||||||
uint64_t offset = get_ui64(info[2]);
|
uint64_t offset = get_ui64(info[2]);
|
||||||
uint64_t len = get_ui64(info[3]);
|
uint64_t len = get_ui64(info[3]);
|
||||||
bool with_parents = Nan::To<bool>(info[4]).FromJust();
|
bool with_parents = Nan::To<bool>(info[4]).FromJust();
|
||||||
v8::Local<v8::Function> callback = info[5].As<v8::Function>();
|
|
||||||
|
|
||||||
auto req = new NodeVitastorRequest(self, callback);
|
auto req = new NodeVitastorRequest(self, info[5].As<v8::Function>());
|
||||||
self->Ref();
|
self->Ref();
|
||||||
vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, on_read_bitmap_finish, req);
|
vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, on_read_bitmap_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
|
static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
|
||||||
|
@ -327,10 +344,12 @@ NAN_METHOD(NodeVitastor::OnReady)
|
||||||
if (info.Length() < 1)
|
if (info.Length() < 1)
|
||||||
Nan::ThrowError("Not enough arguments to on_ready(callback(err))");
|
Nan::ThrowError("Not enough arguments to on_ready(callback(err))");
|
||||||
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
auto req = new NodeVitastorRequest(self, info[0].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(self, callback);
|
|
||||||
self->Ref();
|
self->Ref();
|
||||||
vitastor_c_on_ready(self->c, on_ready_finish, req);
|
vitastor_c_on_ready(self->c, on_ready_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(self->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void NodeVitastor::on_ready_finish(void *opaque, long retval)
|
void NodeVitastor::on_ready_finish(void *opaque, long retval)
|
||||||
|
@ -475,6 +494,9 @@ NAN_METHOD(NodeVitastorImage::Create)
|
||||||
img->Ref();
|
img->Ref();
|
||||||
cli->Ref();
|
cli->Ref();
|
||||||
vitastor_c_watch_inode(cli->c, (char*)img->name.c_str(), on_watch_start, img);
|
vitastor_c_watch_inode(cli->c, (char*)img->name.c_str(), on_watch_start, img);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
|
|
||||||
info.GetReturnValue().Set(info.This());
|
info.GetReturnValue().Set(info.This());
|
||||||
}
|
}
|
||||||
|
@ -546,8 +568,7 @@ NAN_METHOD(NodeVitastorImage::Sync)
|
||||||
|
|
||||||
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
auto req = new NodeVitastorRequest(img->cli, info[0].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(img->cli, callback);
|
|
||||||
req->img = img;
|
req->img = img;
|
||||||
req->op = NODE_VITASTOR_SYNC;
|
req->op = NODE_VITASTOR_SYNC;
|
||||||
|
|
||||||
|
@ -566,9 +587,8 @@ NAN_METHOD(NodeVitastorImage::ReadBitmap)
|
||||||
uint64_t offset = get_ui64(info[0]);
|
uint64_t offset = get_ui64(info[0]);
|
||||||
uint64_t len = get_ui64(info[1]);
|
uint64_t len = get_ui64(info[1]);
|
||||||
bool with_parents = Nan::To<bool>(info[2]).FromJust();
|
bool with_parents = Nan::To<bool>(info[2]).FromJust();
|
||||||
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
|
|
||||||
|
|
||||||
auto req = new NodeVitastorRequest(img->cli, callback);
|
auto req = new NodeVitastorRequest(img->cli, info[3].As<v8::Function>());
|
||||||
req->img = img;
|
req->img = img;
|
||||||
req->op = NODE_VITASTOR_READ_BITMAP;
|
req->op = NODE_VITASTOR_READ_BITMAP;
|
||||||
req->offset = offset;
|
req->offset = offset;
|
||||||
|
@ -587,8 +607,7 @@ NAN_METHOD(NodeVitastorImage::GetInfo)
|
||||||
|
|
||||||
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
auto req = new NodeVitastorRequest(img->cli, info[0].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(img->cli, callback);
|
|
||||||
req->img = img;
|
req->img = img;
|
||||||
req->op = NODE_VITASTOR_GET_INFO;
|
req->op = NODE_VITASTOR_GET_INFO;
|
||||||
|
|
||||||
|
@ -615,6 +634,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
|
||||||
uint64_t ino = vitastor_c_inode_get_num(watch);
|
uint64_t ino = vitastor_c_inode_get_num(watch);
|
||||||
cli->Ref();
|
cli->Ref();
|
||||||
vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::on_read_finish, req);
|
vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::on_read_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else if (req->op == NODE_VITASTOR_WRITE)
|
else if (req->op == NODE_VITASTOR_WRITE)
|
||||||
{
|
{
|
||||||
|
@ -624,6 +646,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
|
||||||
req->iov_list.size() ? req->iov_list.data() : &req->iov,
|
req->iov_list.size() ? req->iov_list.data() : &req->iov,
|
||||||
req->iov_list.size() ? req->iov_list.size() : 1,
|
req->iov_list.size() ? req->iov_list.size() : 1,
|
||||||
NodeVitastor::on_write_finish, req);
|
NodeVitastor::on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else if (req->op == NODE_VITASTOR_DELETE)
|
else if (req->op == NODE_VITASTOR_DELETE)
|
||||||
{
|
{
|
||||||
|
@ -631,6 +656,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
|
||||||
cli->Ref();
|
cli->Ref();
|
||||||
vitastor_c_delete(cli->c, ino, req->offset, req->len, req->version,
|
vitastor_c_delete(cli->c, ino, req->offset, req->len, req->version,
|
||||||
NodeVitastor::on_write_finish, req);
|
NodeVitastor::on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else if (req->op == NODE_VITASTOR_SYNC)
|
else if (req->op == NODE_VITASTOR_SYNC)
|
||||||
{
|
{
|
||||||
|
@ -640,6 +668,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
|
||||||
if (imm != IMMEDIATE_ALL)
|
if (imm != IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
vitastor_c_sync(cli->c, NodeVitastor::on_write_finish, req);
|
vitastor_c_sync(cli->c, NodeVitastor::on_write_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -651,6 +682,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
|
||||||
uint64_t ino = vitastor_c_inode_get_num(watch);
|
uint64_t ino = vitastor_c_inode_get_num(watch);
|
||||||
cli->Ref();
|
cli->Ref();
|
||||||
vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::on_read_bitmap_finish, req);
|
vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::on_read_bitmap_finish, req);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else if (req->op == NODE_VITASTOR_GET_INFO)
|
else if (req->op == NODE_VITASTOR_GET_INFO)
|
||||||
{
|
{
|
||||||
|
@ -768,8 +802,7 @@ NAN_METHOD(NodeVitastorKV::Open)
|
||||||
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
|
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
|
||||||
}
|
}
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
|
auto req = new NodeVitastorRequest(kv->cli, info[3].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(kv->cli, callback);
|
|
||||||
|
|
||||||
kv->Ref();
|
kv->Ref();
|
||||||
kv->dbw->open(inode_id, cfg, [kv, req](int res)
|
kv->dbw->open(inode_id, cfg, [kv, req](int res)
|
||||||
|
@ -782,6 +815,9 @@ NAN_METHOD(NodeVitastorKV::Open)
|
||||||
delete req;
|
delete req;
|
||||||
kv->Unref();
|
kv->Unref();
|
||||||
});
|
});
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// close(callback(err))
|
// close(callback(err))
|
||||||
|
@ -793,8 +829,7 @@ NAN_METHOD(NodeVitastorKV::Close)
|
||||||
|
|
||||||
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
auto req = new NodeVitastorRequest(kv->cli, info[0].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(kv->cli, callback);
|
|
||||||
|
|
||||||
kv->Ref();
|
kv->Ref();
|
||||||
kv->dbw->close([kv, req]()
|
kv->dbw->close([kv, req]()
|
||||||
|
@ -805,6 +840,9 @@ NAN_METHOD(NodeVitastorKV::Close)
|
||||||
delete req;
|
delete req;
|
||||||
kv->Unref();
|
kv->Unref();
|
||||||
});
|
});
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// set_config({ ...config })
|
// set_config({ ...config })
|
||||||
|
@ -848,8 +886,7 @@ void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info,
|
||||||
// FIXME: Handle Buffer too
|
// FIXME: Handle Buffer too
|
||||||
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
|
auto req = new NodeVitastorRequest(kv->cli, info[1].As<v8::Function>());
|
||||||
auto req = new NodeVitastorRequest(kv->cli, callback);
|
|
||||||
|
|
||||||
kv->Ref();
|
kv->Ref();
|
||||||
kv->dbw->get(key, [kv, req](int res, const std::string & value)
|
kv->dbw->get(key, [kv, req](int res, const std::string & value)
|
||||||
|
@ -863,6 +900,9 @@ void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info,
|
||||||
delete req;
|
delete req;
|
||||||
kv->Unref();
|
kv->Unref();
|
||||||
}, allow_cache);
|
}, allow_cache);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// get(key, callback(err, value))
|
// get(key, callback(err, value))
|
||||||
|
@ -911,14 +951,12 @@ NAN_METHOD(NodeVitastorKV::Set)
|
||||||
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
||||||
std::string value(*Nan::Utf8String(info[1].As<v8::String>()));
|
std::string value(*Nan::Utf8String(info[1].As<v8::String>()));
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[2].As<v8::Function>();
|
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, info[2].As<v8::Function>()), *cas_req = NULL;
|
||||||
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
|
|
||||||
|
|
||||||
std::function<bool(int, const std::string &)> cas_cb;
|
std::function<bool(int, const std::string &)> cas_cb;
|
||||||
if (info.Length() > 3 && info[3]->IsObject())
|
if (info.Length() > 3 && info[3]->IsObject())
|
||||||
{
|
{
|
||||||
v8::Local<v8::Function> cas_callback = info[3].As<v8::Function>();
|
cas_req = new NodeVitastorRequest(kv->cli, info[3].As<v8::Function>());
|
||||||
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
|
|
||||||
cas_cb = make_cas_callback(cas_req);
|
cas_cb = make_cas_callback(cas_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -935,6 +973,9 @@ NAN_METHOD(NodeVitastorKV::Set)
|
||||||
delete cas_req;
|
delete cas_req;
|
||||||
kv->Unref();
|
kv->Unref();
|
||||||
}, cas_cb);
|
}, cas_cb);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// del(key, callback(err), cas_compare(old_value)?)
|
// del(key, callback(err), cas_compare(old_value)?)
|
||||||
|
@ -949,14 +990,12 @@ NAN_METHOD(NodeVitastorKV::Del)
|
||||||
// FIXME: Handle Buffer too
|
// FIXME: Handle Buffer too
|
||||||
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
|
||||||
|
|
||||||
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
|
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, info[1].As<v8::Function>()), *cas_req = NULL;
|
||||||
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
|
|
||||||
|
|
||||||
std::function<bool(int, const std::string &)> cas_cb;
|
std::function<bool(int, const std::string &)> cas_cb;
|
||||||
if (info.Length() > 2 && info[2]->IsObject())
|
if (info.Length() > 2 && info[2]->IsObject())
|
||||||
{
|
{
|
||||||
v8::Local<v8::Function> cas_callback = info[2].As<v8::Function>();
|
cas_req = new NodeVitastorRequest(kv->cli, info[2].As<v8::Function>());
|
||||||
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
|
|
||||||
cas_cb = make_cas_callback(cas_req);
|
cas_cb = make_cas_callback(cas_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -973,6 +1012,9 @@ NAN_METHOD(NodeVitastorKV::Del)
|
||||||
delete cas_req;
|
delete cas_req;
|
||||||
kv->Unref();
|
kv->Unref();
|
||||||
}, cas_cb);
|
}, cas_cb);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// list(start_key?)
|
// list(start_key?)
|
||||||
|
@ -1052,12 +1094,11 @@ NAN_METHOD(NodeVitastorKVListing::Next)
|
||||||
|
|
||||||
if (info.Length() > 0)
|
if (info.Length() > 0)
|
||||||
{
|
{
|
||||||
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
|
|
||||||
if (list->iter)
|
if (list->iter)
|
||||||
{
|
{
|
||||||
delete list->iter;
|
delete list->iter;
|
||||||
}
|
}
|
||||||
list->iter = new NodeVitastorRequest(list->kv->cli, callback);
|
list->iter = new NodeVitastorRequest(list->kv->cli, info[0].As<v8::Function>());
|
||||||
}
|
}
|
||||||
if (!list->handle)
|
if (!list->handle)
|
||||||
{
|
{
|
||||||
|
@ -1093,6 +1134,9 @@ NAN_METHOD(NodeVitastorKVListing::Next)
|
||||||
list->iter = req;
|
list->iter = req;
|
||||||
list->kv->Unref();
|
list->kv->Unref();
|
||||||
});
|
});
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
|
vitastor_c_uring_handle_events(list->kv->cli->c);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// close()
|
// close()
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "vitastor",
|
"name": "vitastor",
|
||||||
"version": "1.7.0",
|
"version": "2.1.0",
|
||||||
"description": "Low-level native bindings to Vitastor client library",
|
"description": "Low-level native bindings to Vitastor client library",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
|
@ -16,7 +16,7 @@
|
||||||
"build": "node-gyp rebuild"
|
"build": "node-gyp rebuild"
|
||||||
},
|
},
|
||||||
"author": "Vitaliy Filippov",
|
"author": "Vitaliy Filippov",
|
||||||
"license": "VNPL-2.0",
|
"license": "VNPL-1.1",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"bindings": "1.5.0",
|
"bindings": "1.5.0",
|
||||||
"nan": "^2.19.0"
|
"nan": "^2.19.0"
|
||||||
|
|
|
@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VITASTOR_VERSION = '1.11.0'
|
VITASTOR_VERSION = '2.1.0'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,172 @@
|
||||||
|
Index: pve-qemu-kvm-9.2.0/block/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-9.2.0.orig/block/meson.build
|
||||||
|
+++ pve-qemu-kvm-9.2.0/block/meson.build
|
||||||
|
@@ -126,6 +126,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
Index: pve-qemu-kvm-9.2.0/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-9.2.0.orig/meson.build
|
||||||
|
+++ pve-qemu-kvm-9.2.0/meson.build
|
||||||
|
@@ -1590,6 +1590,26 @@ if not get_option('rbd').auto() or have_
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2478,6 +2498,7 @@ endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
@@ -4789,6 +4810,7 @@ summary_info += {'fdt support': fd
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
Index: pve-qemu-kvm-9.2.0/meson_options.txt
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-9.2.0.orig/meson_options.txt
|
||||||
|
+++ pve-qemu-kvm-9.2.0/meson_options.txt
|
||||||
|
@@ -200,6 +200,8 @@ option('lzo', type : 'feature', value :
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
Index: pve-qemu-kvm-9.2.0/qapi/block-core.json
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-9.2.0.orig/qapi/block-core.json
|
||||||
|
+++ pve-qemu-kvm-9.2.0/qapi/block-core.json
|
||||||
|
@@ -3481,7 +3481,7 @@
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
'pbs',
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4592,6 +4592,28 @@
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
# An enumeration of replication modes.
|
||||||
|
@@ -5054,6 +5076,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5501,6 +5524,20 @@
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @location: Where to store the new image file. This location cannot
|
||||||
|
+# point to a snapshot.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
# Subformat options for VMDK images
|
||||||
|
@@ -5722,6 +5759,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
Index: pve-qemu-kvm-9.2.0/scripts/meson-buildoptions.sh
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-9.2.0.orig/scripts/meson-buildoptions.sh
|
||||||
|
+++ pve-qemu-kvm-9.2.0/scripts/meson-buildoptions.sh
|
||||||
|
@@ -174,6 +174,7 @@ meson_options_help() {
|
||||||
|
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||||
|
printf "%s\n" ' qpl Query Processing Library support'
|
||||||
|
printf "%s\n" ' rbd Ceph block device driver'
|
||||||
|
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||||
|
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||||
|
printf "%s\n" ' replication replication support'
|
||||||
|
printf "%s\n" ' rust Rust support'
|
||||||
|
@@ -455,6 +456,8 @@ _meson_option_parse() {
|
||||||
|
--disable-qpl) printf "%s" -Dqpl=disabled ;;
|
||||||
|
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||||
|
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||||
|
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||||
|
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||||
|
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||||
|
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||||
|
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
|
@ -0,0 +1,172 @@
|
||||||
|
diff --git a/block/meson.build b/block/meson.build
|
||||||
|
index f1262ec2ba..3cf3e23f16 100644
|
||||||
|
--- a/block/meson.build
|
||||||
|
+++ b/block/meson.build
|
||||||
|
@@ -114,6 +114,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
diff --git a/meson.build b/meson.build
|
||||||
|
index 147097c652..2486b3aeb5 100644
|
||||||
|
--- a/meson.build
|
||||||
|
+++ b/meson.build
|
||||||
|
@@ -1590,6 +1590,26 @@ if not get_option('rbd').auto() or have_block
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2474,6 +2494,7 @@ endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
@@ -4778,6 +4799,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
diff --git a/meson_options.txt b/meson_options.txt
|
||||||
|
index 5eeaf3eee5..b04eda29f9 100644
|
||||||
|
--- a/meson_options.txt
|
||||||
|
+++ b/meson_options.txt
|
||||||
|
@@ -200,6 +200,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||||
|
index fd3bcc1c17..41571ac3f9 100644
|
||||||
|
--- a/qapi/block-core.json
|
||||||
|
+++ b/qapi/block-core.json
|
||||||
|
@@ -3212,7 +3212,7 @@
|
||||||
|
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4295,6 +4295,28 @@
|
||||||
|
'*key-secret': 'str',
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
@@ -4757,6 +4779,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5198,6 +5221,20 @@
|
||||||
|
'*cluster-size' : 'size',
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @location: Where to store the new image file. This location cannot
|
||||||
|
+# point to a snapshot.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
@@ -5420,6 +5457,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||||
|
index a8066aab03..12e650e7d4 100644
|
||||||
|
--- a/scripts/meson-buildoptions.sh
|
||||||
|
+++ b/scripts/meson-buildoptions.sh
|
||||||
|
@@ -174,6 +174,7 @@ meson_options_help() {
|
||||||
|
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||||
|
printf "%s\n" ' qpl Query Processing Library support'
|
||||||
|
printf "%s\n" ' rbd Ceph block device driver'
|
||||||
|
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||||
|
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||||
|
printf "%s\n" ' replication replication support'
|
||||||
|
printf "%s\n" ' rust Rust support'
|
||||||
|
@@ -455,6 +456,8 @@ _meson_option_parse() {
|
||||||
|
--disable-qpl) printf "%s" -Dqpl=disabled ;;
|
||||||
|
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||||
|
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||||
|
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||||
|
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||||
|
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||||
|
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||||
|
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.11.0
|
Version: 2.1.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.11.0.el7.tar.gz
|
Source0: vitastor-2.1.0.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.11.0
|
Version: 2.1.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.11.0.el8.tar.gz
|
Source0: vitastor-2.1.0.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.11.0
|
Version: 2.1.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.11.0.el9.tar.gz
|
Source0: vitastor-2.1.0.el9.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVITASTOR_VERSION="1.11.0")
|
add_definitions(-DVITASTOR_VERSION="2.1.0")
|
||||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
add_link_options(-fno-omit-frame-pointer)
|
add_link_options(-fno-omit-frame-pointer)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include "blockstore_impl.h"
|
#include "blockstore_impl.h"
|
||||||
#include "blockstore_disk.h"
|
#include "blockstore_disk.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "allocator.h"
|
||||||
|
|
||||||
static uint32_t is_power_of_two(uint64_t value)
|
static uint32_t is_power_of_two(uint64_t value)
|
||||||
{
|
{
|
||||||
|
@ -83,6 +84,12 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||||
throw std::runtime_error("data_csum_type="+config["data_csum_type"]+" is unsupported, only \"crc32c\" and \"none\" are supported");
|
throw std::runtime_error("data_csum_type="+config["data_csum_type"]+" is unsupported, only \"crc32c\" and \"none\" are supported");
|
||||||
}
|
}
|
||||||
csum_block_size = parse_size(config["csum_block_size"]);
|
csum_block_size = parse_size(config["csum_block_size"]);
|
||||||
|
discard_on_start = config.find("discard_on_start") != config.end() &&
|
||||||
|
(config["discard_on_start"] == "true" || config["discard_on_start"] == "1" || config["discard_on_start"] == "yes");
|
||||||
|
min_discard_size = parse_size(config["min_discard_size"]);
|
||||||
|
if (!min_discard_size)
|
||||||
|
min_discard_size = 1024*1024;
|
||||||
|
discard_granularity = parse_size(config["discard_granularity"]);
|
||||||
// Validate
|
// Validate
|
||||||
if (!data_block_size)
|
if (!data_block_size)
|
||||||
{
|
{
|
||||||
|
@ -172,10 +179,6 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||||
{
|
{
|
||||||
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
|
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
|
||||||
}
|
}
|
||||||
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
|
|
||||||
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
|
|
||||||
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
|
|
||||||
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||||
|
@ -224,9 +227,13 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||||
}
|
}
|
||||||
// required metadata size
|
// required metadata size
|
||||||
block_count = data_len / data_block_size;
|
block_count = data_len / data_block_size;
|
||||||
|
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
|
||||||
|
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
|
||||||
|
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
|
||||||
|
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
|
||||||
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 ||
|
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
|
||||||
!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type)
|
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
|
||||||
{
|
{
|
||||||
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
|
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
|
||||||
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
|
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
|
||||||
|
@ -234,7 +241,11 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
|
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
|
||||||
{
|
{
|
||||||
// Old metadata fits.
|
// Old metadata fits.
|
||||||
printf("Warning: Using old metadata format without checksums because the new format doesn't fit into provided area\n");
|
if (new_doesnt_fit)
|
||||||
|
{
|
||||||
|
printf("Warning: Using old metadata format without checksums because the new format"
|
||||||
|
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
|
||||||
|
}
|
||||||
clean_entry_size = clean_entry_v0_size;
|
clean_entry_size = clean_entry_v0_size;
|
||||||
meta_len = meta_v0_len;
|
meta_len = meta_v0_len;
|
||||||
meta_format = BLOCKSTORE_META_FORMAT_V1;
|
meta_format = BLOCKSTORE_META_FORMAT_V1;
|
||||||
|
@ -246,7 +257,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||||
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||||
if (!skip_meta_check && meta_area_size < meta_len)
|
if (!skip_meta_check && meta_area_size < meta_len)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
|
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
|
||||||
}
|
}
|
||||||
// requested journal size
|
// requested journal size
|
||||||
if (!skip_meta_check && cfg_journal_size > journal_len)
|
if (!skip_meta_check && cfg_journal_size > journal_len)
|
||||||
|
@ -415,3 +426,44 @@ void blockstore_disk_t::close_all()
|
||||||
close(journal_fd);
|
close(journal_fd);
|
||||||
data_fd = meta_fd = journal_fd = -1;
|
data_fd = meta_fd = journal_fd = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
|
||||||
|
// so it's not a big deal that we can only run it synchronously.
|
||||||
|
int blockstore_disk_t::trim_data(allocator_t *alloc)
|
||||||
|
{
|
||||||
|
int r = 0;
|
||||||
|
uint64_t j = 0, i = 0;
|
||||||
|
uint64_t discarded = 0;
|
||||||
|
for (; i <= block_count; i++)
|
||||||
|
{
|
||||||
|
if (i >= block_count || alloc->get(i))
|
||||||
|
{
|
||||||
|
if (i > j && (i-j)*data_block_size >= min_discard_size)
|
||||||
|
{
|
||||||
|
uint64_t range[2] = { data_offset + j*data_block_size, (i-j)*data_block_size };
|
||||||
|
if (discard_granularity)
|
||||||
|
{
|
||||||
|
range[1] += range[0];
|
||||||
|
if (range[1] % discard_granularity)
|
||||||
|
range[1] = range[1] - (range[1] % discard_granularity);
|
||||||
|
if (range[0] % discard_granularity)
|
||||||
|
range[0] = range[0] + discard_granularity - (range[0] % discard_granularity);
|
||||||
|
if (range[0] >= range[1])
|
||||||
|
continue;
|
||||||
|
range[1] -= range[0];
|
||||||
|
}
|
||||||
|
r = ioctl(data_fd, BLKDISCARD, &range);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to execute BLKDISCARD %ju+%ju on %s: %s (code %d)\n",
|
||||||
|
range[0], range[1], data_device.c_str(), strerror(-r), r);
|
||||||
|
return -errno;
|
||||||
|
}
|
||||||
|
discarded += range[1];
|
||||||
|
}
|
||||||
|
j = i+1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s (%ju bytes) of unused data discarded on %s\n", format_size(discarded).c_str(), discarded, data_device.c_str());
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
|
@ -12,6 +12,8 @@
|
||||||
// Lower byte of checksum type is its length
|
// Lower byte of checksum type is its length
|
||||||
#define BLOCKSTORE_CSUM_CRC32C 0x104
|
#define BLOCKSTORE_CSUM_CRC32C 0x104
|
||||||
|
|
||||||
|
class allocator_t;
|
||||||
|
|
||||||
struct blockstore_disk_t
|
struct blockstore_disk_t
|
||||||
{
|
{
|
||||||
std::string data_device, meta_device, journal_device;
|
std::string data_device, meta_device, journal_device;
|
||||||
|
@ -34,14 +36,18 @@ struct blockstore_disk_t
|
||||||
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
|
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
|
||||||
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
|
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
|
||||||
std::string data_io, meta_io, journal_io;
|
std::string data_io, meta_io, journal_io;
|
||||||
|
// Data discard granularity and minimum size (for the sake of performance)
|
||||||
|
bool discard_on_start = false;
|
||||||
|
uint64_t min_discard_size = 1024*1024;
|
||||||
|
uint64_t discard_granularity = 0;
|
||||||
|
|
||||||
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
||||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
||||||
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
||||||
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
||||||
|
|
||||||
uint32_t block_order;
|
uint32_t block_order = 0;
|
||||||
uint64_t block_count;
|
uint64_t block_count = 0;
|
||||||
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
|
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
|
||||||
|
|
||||||
void parse_config(std::map<std::string, std::string> & config);
|
void parse_config(std::map<std::string, std::string> & config);
|
||||||
|
@ -50,6 +56,7 @@ struct blockstore_disk_t
|
||||||
void open_journal();
|
void open_journal();
|
||||||
void calc_lengths(bool skip_meta_check = false);
|
void calc_lengths(bool skip_meta_check = false);
|
||||||
void close_all();
|
void close_all();
|
||||||
|
int trim_data(allocator_t *alloc);
|
||||||
|
|
||||||
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
|
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
|
||||||
{
|
{
|
||||||
|
|
|
@ -12,15 +12,15 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||||
ringloop->register_consumer(&ring_consumer);
|
ringloop->register_consumer(&ring_consumer);
|
||||||
initialized = 0;
|
initialized = 0;
|
||||||
parse_config(config, true);
|
parse_config(config, true);
|
||||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
|
||||||
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
dsk.open_data();
|
dsk.open_data();
|
||||||
dsk.open_meta();
|
dsk.open_meta();
|
||||||
dsk.open_journal();
|
dsk.open_journal();
|
||||||
calc_lengths();
|
calc_lengths();
|
||||||
data_alloc = new allocator(dsk.block_count);
|
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
|
||||||
|
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||||
|
data_alloc = new allocator_t(dsk.block_count);
|
||||||
}
|
}
|
||||||
catch (std::exception & e)
|
catch (std::exception & e)
|
||||||
{
|
{
|
||||||
|
@ -34,7 +34,8 @@ blockstore_impl_t::~blockstore_impl_t()
|
||||||
{
|
{
|
||||||
delete data_alloc;
|
delete data_alloc;
|
||||||
delete flusher;
|
delete flusher;
|
||||||
free(zero_object);
|
if (zero_object)
|
||||||
|
free(zero_object);
|
||||||
ringloop->unregister_consumer(&ring_consumer);
|
ringloop->unregister_consumer(&ring_consumer);
|
||||||
dsk.close_all();
|
dsk.close_all();
|
||||||
if (metadata_buffer)
|
if (metadata_buffer)
|
||||||
|
@ -83,14 +84,20 @@ void blockstore_impl_t::loop()
|
||||||
{
|
{
|
||||||
delete journal_init_reader;
|
delete journal_init_reader;
|
||||||
journal_init_reader = NULL;
|
journal_init_reader = NULL;
|
||||||
if (journal.flush_journal)
|
initialized = 3;
|
||||||
initialized = 3;
|
|
||||||
else
|
|
||||||
initialized = 10;
|
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (initialized == 3)
|
if (initialized == 3)
|
||||||
|
{
|
||||||
|
if (!readonly && dsk.discard_on_start)
|
||||||
|
dsk.trim_data(data_alloc);
|
||||||
|
if (journal.flush_journal)
|
||||||
|
initialized = 4;
|
||||||
|
else
|
||||||
|
initialized = 10;
|
||||||
|
}
|
||||||
|
if (initialized == 4)
|
||||||
{
|
{
|
||||||
if (readonly)
|
if (readonly)
|
||||||
{
|
{
|
||||||
|
|
|
@ -279,9 +279,9 @@ class blockstore_impl_t
|
||||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||||
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
||||||
int unsynced_queued_ops = 0;
|
int unsynced_queued_ops = 0;
|
||||||
allocator *data_alloc = NULL;
|
allocator_t *data_alloc = NULL;
|
||||||
uint64_t used_blocks = 0;
|
uint64_t used_blocks = 0;
|
||||||
uint8_t *zero_object;
|
uint8_t *zero_object = NULL;
|
||||||
|
|
||||||
void *metadata_buffer = NULL;
|
void *metadata_buffer = NULL;
|
||||||
|
|
||||||
|
|
|
@ -138,7 +138,11 @@ resume_1:
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
hdr->header_csum = csum;
|
hdr->header_csum = csum;
|
||||||
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
|
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
|
||||||
|
{
|
||||||
|
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||||
|
bs->dsk.calc_lengths();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
|
else if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
|
||||||
{
|
{
|
||||||
|
@ -146,11 +150,15 @@ resume_1:
|
||||||
hdr->csum_block_size = 0;
|
hdr->csum_block_size = 0;
|
||||||
hdr->header_csum = 0;
|
hdr->header_csum = 0;
|
||||||
// Enable compatibility mode - entries without checksums
|
// Enable compatibility mode - entries without checksums
|
||||||
bs->dsk.clean_entry_size = sizeof(clean_disk_entry) + bs->dsk.clean_entry_bitmap_size*2;
|
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V1 ||
|
||||||
bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size)
|
bs->dsk.data_csum_type != 0 || bs->dsk.csum_block_size != 0)
|
||||||
/ (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
|
{
|
||||||
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
|
bs->dsk.data_csum_type = 0;
|
||||||
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
|
bs->dsk.csum_block_size = 0;
|
||||||
|
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
|
||||||
|
bs->dsk.calc_lengths();
|
||||||
|
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
|
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
|
||||||
{
|
{
|
||||||
|
@ -338,7 +346,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||||
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
|
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
|
||||||
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
|
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
|
||||||
{
|
{
|
||||||
printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
|
printf("Metadata entry %ju is corrupt (checksum mismatch: %08x vs %08x), skipping\n", done_cnt+i, *entry_csum, crc32c(0, entry, bs->dsk.clean_entry_size - 4));
|
||||||
// zero out the invalid entry, otherwise we'll hit "tried to overwrite non-zero metadata entry" later
|
// zero out the invalid entry, otherwise we'll hit "tried to overwrite non-zero metadata entry" later
|
||||||
if (bs->inmemory_meta)
|
if (bs->inmemory_meta)
|
||||||
{
|
{
|
||||||
|
|
|
@ -7,10 +7,14 @@ set(MSGR_RDMA "")
|
||||||
if (IBVERBS_LIBRARIES)
|
if (IBVERBS_LIBRARIES)
|
||||||
set(MSGR_RDMA "msgr_rdma.cpp")
|
set(MSGR_RDMA "msgr_rdma.cpp")
|
||||||
endif (IBVERBS_LIBRARIES)
|
endif (IBVERBS_LIBRARIES)
|
||||||
|
set(MSGR_RDMACM "")
|
||||||
|
if (RDMACM_LIBRARIES)
|
||||||
|
set(MSGR_RDMACM "msgr_rdmacm.cpp")
|
||||||
|
endif (RDMACM_LIBRARIES)
|
||||||
add_library(vitastor_common STATIC
|
add_library(vitastor_common STATIC
|
||||||
../util/epoll_manager.cpp etcd_state_client.cpp messenger.cpp ../util/addr_util.cpp
|
../util/epoll_manager.cpp etcd_state_client.cpp messenger.cpp ../util/addr_util.cpp
|
||||||
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
|
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
|
||||||
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ${MSGR_RDMA}
|
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ${MSGR_RDMA} ${MSGR_RDMACM}
|
||||||
)
|
)
|
||||||
target_link_libraries(vitastor_common pthread)
|
target_link_libraries(vitastor_common pthread)
|
||||||
target_compile_options(vitastor_common PUBLIC -fPIC)
|
target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||||
|
@ -28,6 +32,7 @@ target_link_libraries(vitastor_client
|
||||||
vitastor_cli
|
vitastor_cli
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
${IBVERBS_LIBRARIES}
|
${IBVERBS_LIBRARIES}
|
||||||
|
${RDMACM_LIBRARIES}
|
||||||
)
|
)
|
||||||
set_target_properties(vitastor_client PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
|
set_target_properties(vitastor_client PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
|
||||||
configure_file(vitastor.pc.in vitastor.pc @ONLY)
|
configure_file(vitastor.pc.in vitastor.pc @ONLY)
|
||||||
|
|
|
@ -29,8 +29,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||||
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
|
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
// peer_osd just connected
|
// peer_osd just connected
|
||||||
// retry operations waiting for connection immediately
|
continue_ops();
|
||||||
continue_ops(client_retry_interval);
|
|
||||||
continue_lists();
|
continue_lists();
|
||||||
continue_raw_ops(peer_osd);
|
continue_raw_ops(peer_osd);
|
||||||
}
|
}
|
||||||
|
|
|
@ -83,6 +83,9 @@ class writeback_cache_t;
|
||||||
// FIXME: Split into public and private interfaces
|
// FIXME: Split into public and private interfaces
|
||||||
class cluster_client_t
|
class cluster_client_t
|
||||||
{
|
{
|
||||||
|
#ifdef __MOCK__
|
||||||
|
public:
|
||||||
|
#endif
|
||||||
timerfd_manager_t *tfd;
|
timerfd_manager_t *tfd;
|
||||||
ring_loop_t *ringloop;
|
ring_loop_t *ringloop;
|
||||||
|
|
||||||
|
@ -144,8 +147,6 @@ public:
|
||||||
|
|
||||||
bool get_immediate_commit(uint64_t inode);
|
bool get_immediate_commit(uint64_t inode);
|
||||||
|
|
||||||
void continue_ops(int time_passed = 0);
|
|
||||||
|
|
||||||
void list_inode(inode_t inode, uint64_t min_offset, uint64_t max_offset, int max_parallel_pgs, std::function<void(
|
void list_inode(inode_t inode, uint64_t min_offset, uint64_t max_offset, int max_parallel_pgs, std::function<void(
|
||||||
int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> pg_callback);
|
int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> pg_callback);
|
||||||
|
|
||||||
|
@ -153,6 +154,11 @@ public:
|
||||||
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
|
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
|
||||||
uint64_t next_op_id();
|
uint64_t next_op_id();
|
||||||
|
|
||||||
|
#ifndef __MOCK__
|
||||||
|
protected:
|
||||||
|
#endif
|
||||||
|
void continue_ops(int time_passed = 0);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||||
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
|
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
|
||||||
|
|
|
@ -52,6 +52,7 @@ public:
|
||||||
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
|
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
|
||||||
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
|
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
|
||||||
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
|
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
|
||||||
|
void delete_flush(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
|
||||||
void fsync_start();
|
void fsync_start();
|
||||||
void fsync_error();
|
void fsync_error();
|
||||||
void fsync_ok();
|
void fsync_ok();
|
||||||
|
|
|
@ -9,7 +9,7 @@ writeback_cache_t::~writeback_cache_t()
|
||||||
{
|
{
|
||||||
for (auto & bp: dirty_buffers)
|
for (auto & bp: dirty_buffers)
|
||||||
{
|
{
|
||||||
if (!--(*bp.second.refcnt))
|
if (bp.second.buf && !--(*bp.second.refcnt))
|
||||||
{
|
{
|
||||||
free(bp.second.refcnt); // refcnt is allocated with the buffer
|
free(bp.second.refcnt); // refcnt is allocated with the buffer
|
||||||
}
|
}
|
||||||
|
@ -115,7 +115,10 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
|
||||||
.flush_id = dirty_it->second.flush_id,
|
.flush_id = dirty_it->second.flush_id,
|
||||||
.refcnt = dirty_it->second.refcnt,
|
.refcnt = dirty_it->second.refcnt,
|
||||||
});
|
});
|
||||||
(*dirty_it->second.refcnt)++;
|
if (dirty_it->second.buf)
|
||||||
|
{
|
||||||
|
(*dirty_it->second.refcnt)++;
|
||||||
|
}
|
||||||
if (dirty_it->second.state == CACHE_DIRTY)
|
if (dirty_it->second.state == CACHE_DIRTY)
|
||||||
{
|
{
|
||||||
if (dirty_it->second.buf)
|
if (dirty_it->second.buf)
|
||||||
|
@ -193,7 +196,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
|
||||||
writeback_queue_size++;
|
writeback_queue_size++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!--(*dirty_it->second.refcnt))
|
if (dirty_it->second.buf && !--(*dirty_it->second.refcnt))
|
||||||
{
|
{
|
||||||
free(dirty_it->second.refcnt);
|
free(dirty_it->second.refcnt);
|
||||||
}
|
}
|
||||||
|
@ -204,7 +207,10 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
|
||||||
bool is_del = op->opcode == OSD_OP_DELETE;
|
bool is_del = op->opcode == OSD_OP_DELETE;
|
||||||
uint64_t *refcnt = is_del ? NULL : (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
|
uint64_t *refcnt = is_del ? NULL : (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
|
||||||
uint8_t *buf = is_del ? NULL : ((uint8_t*)refcnt + sizeof(uint64_t));
|
uint8_t *buf = is_del ? NULL : ((uint8_t*)refcnt + sizeof(uint64_t));
|
||||||
*refcnt = 1;
|
if (!is_del)
|
||||||
|
{
|
||||||
|
*refcnt = 1;
|
||||||
|
}
|
||||||
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
|
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
|
||||||
.inode = op->inode,
|
.inode = op->inode,
|
||||||
.stripe = op->offset,
|
.stripe = op->offset,
|
||||||
|
@ -326,7 +332,14 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
|
||||||
}
|
}
|
||||||
flushed_buffers.erase(fl_it++);
|
flushed_buffers.erase(fl_it++);
|
||||||
}
|
}
|
||||||
mark_flush_written(op->inode, op->offset, op->len, flush_id);
|
if (op->flags & OP_IMMEDIATE_COMMIT)
|
||||||
|
{
|
||||||
|
delete_flush(op->inode, op->offset, op->len, flush_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
mark_flush_written(op->inode, op->offset, op->len, flush_id);
|
||||||
|
}
|
||||||
delete op;
|
delete op;
|
||||||
writebacks_active--;
|
writebacks_active--;
|
||||||
// We can't call execute_internal because it affects an invalid copy of the list here
|
// We can't call execute_internal because it affects an invalid copy of the list here
|
||||||
|
@ -344,6 +357,25 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void writeback_cache_t::delete_flush(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
|
||||||
|
{
|
||||||
|
for (auto dirty_it = find_dirty(inode, offset);
|
||||||
|
dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode &&
|
||||||
|
dirty_it->first.stripe < offset+len; )
|
||||||
|
{
|
||||||
|
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
|
||||||
|
{
|
||||||
|
if (dirty_it->second.buf && !--(*dirty_it->second.refcnt))
|
||||||
|
{
|
||||||
|
free(dirty_it->second.refcnt);
|
||||||
|
}
|
||||||
|
dirty_buffers.erase(dirty_it++);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
dirty_it++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
|
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
|
||||||
{
|
{
|
||||||
for (auto dirty_it = find_dirty(inode, offset);
|
for (auto dirty_it = find_dirty(inode, offset);
|
||||||
|
@ -560,8 +592,10 @@ void writeback_cache_t::fsync_ok()
|
||||||
{
|
{
|
||||||
if (uw_it->second.state == CACHE_FLUSHING)
|
if (uw_it->second.state == CACHE_FLUSHING)
|
||||||
{
|
{
|
||||||
if (!--(*uw_it->second.refcnt))
|
if (uw_it->second.buf && !--(*uw_it->second.refcnt))
|
||||||
|
{
|
||||||
free(uw_it->second.refcnt);
|
free(uw_it->second.refcnt);
|
||||||
|
}
|
||||||
dirty_buffers.erase(uw_it++);
|
dirty_buffers.erase(uw_it++);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -31,6 +31,11 @@ etcd_state_client_t::~etcd_state_client_t()
|
||||||
keepalive_client = NULL;
|
keepalive_client = NULL;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
if (load_pgs_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(load_pgs_timer_id);
|
||||||
|
load_pgs_timer_id = -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
|
@ -143,6 +148,7 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
|
||||||
}
|
}
|
||||||
if (interval > 0)
|
if (interval > 0)
|
||||||
{
|
{
|
||||||
|
// FIXME: Prevent destruction of etcd_state_client if timers or requests are active
|
||||||
tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
|
tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
|
||||||
{
|
{
|
||||||
etcd_call(api, payload, timeout, retries, interval, callback);
|
etcd_call(api, payload, timeout, retries, interval, callback);
|
||||||
|
@ -176,7 +182,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (!local_ips.size())
|
if (!local_ips.size())
|
||||||
local_ips = getifaddr_list(std::vector<std::string>(), true);
|
local_ips = getifaddr_list(std::vector<addr_mask_t>(), true);
|
||||||
std::string check_addr;
|
std::string check_addr;
|
||||||
int pos = addr.find('/');
|
int pos = addr.find('/');
|
||||||
int pos2 = addr.find(':');
|
int pos2 = addr.find(':');
|
||||||
|
@ -271,6 +277,11 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
this->etcd_quick_timeout = 1000;
|
this->etcd_quick_timeout = 1000;
|
||||||
}
|
}
|
||||||
|
this->etcd_min_reload_interval = config["etcd_min_reload_interval"].uint64_value();
|
||||||
|
if (this->etcd_min_reload_interval <= 0)
|
||||||
|
{
|
||||||
|
this->etcd_min_reload_interval = 50;
|
||||||
|
}
|
||||||
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
|
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
|
||||||
{
|
{
|
||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
|
@ -603,6 +614,23 @@ void etcd_state_client_t::load_global_config()
|
||||||
|
|
||||||
void etcd_state_client_t::load_pgs()
|
void etcd_state_client_t::load_pgs()
|
||||||
{
|
{
|
||||||
|
timespec tv;
|
||||||
|
clock_gettime(CLOCK_REALTIME, &tv);
|
||||||
|
uint64_t ms_passed = (tv.tv_sec-etcd_last_reload.tv_sec)*1000 + (tv.tv_nsec-etcd_last_reload.tv_nsec)/1000000;
|
||||||
|
if (ms_passed < etcd_min_reload_interval)
|
||||||
|
{
|
||||||
|
if (load_pgs_timer_id < 0)
|
||||||
|
{
|
||||||
|
load_pgs_timer_id = tfd->set_timer(etcd_min_reload_interval+50-ms_passed, false, [this](int) { load_pgs(); });
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
etcd_last_reload = tv;
|
||||||
|
if (load_pgs_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(load_pgs_timer_id);
|
||||||
|
load_pgs_timer_id = -1;
|
||||||
|
}
|
||||||
json11::Json::array txn = {
|
json11::Json::array txn = {
|
||||||
json11::Json::object {
|
json11::Json::object {
|
||||||
{ "request_range", json11::Json::object {
|
{ "request_range", json11::Json::object {
|
||||||
|
@ -889,7 +917,11 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
if (!pc.scrub_interval)
|
if (!pc.scrub_interval)
|
||||||
pc.scrub_interval = 0;
|
pc.scrub_interval = 0;
|
||||||
// Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
|
// Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
|
||||||
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
|
pc.used_for_app = pool_item.second["used_for_fs"].as_string();
|
||||||
|
if (pc.used_for_app != "")
|
||||||
|
pc.used_for_app = "fs:"+pc.used_for_app;
|
||||||
|
else
|
||||||
|
pc.used_for_app = pool_item.second["used_for_app"].as_string();
|
||||||
// Immediate Commit Mode
|
// Immediate Commit Mode
|
||||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||||
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
||||||
|
|
|
@ -61,7 +61,7 @@ struct pool_config_t
|
||||||
uint64_t pg_stripe_size;
|
uint64_t pg_stripe_size;
|
||||||
std::map<pg_num_t, pg_config_t> pg_config;
|
std::map<pg_num_t, pg_config_t> pg_config;
|
||||||
uint64_t scrub_interval;
|
uint64_t scrub_interval;
|
||||||
std::string used_for_fs;
|
std::string used_for_app;
|
||||||
int backfillfull;
|
int backfillfull;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -108,6 +108,7 @@ public:
|
||||||
int max_etcd_attempts = 5;
|
int max_etcd_attempts = 5;
|
||||||
int etcd_quick_timeout = 1000;
|
int etcd_quick_timeout = 1000;
|
||||||
int etcd_slow_timeout = 5000;
|
int etcd_slow_timeout = 5000;
|
||||||
|
int etcd_min_reload_interval = 1000;
|
||||||
bool infinite_start = true;
|
bool infinite_start = true;
|
||||||
uint64_t global_block_size = DEFAULT_BLOCK_SIZE;
|
uint64_t global_block_size = DEFAULT_BLOCK_SIZE;
|
||||||
uint32_t global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
uint32_t global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||||
|
@ -122,6 +123,8 @@ public:
|
||||||
uint64_t etcd_watch_revision_config = 0;
|
uint64_t etcd_watch_revision_config = 0;
|
||||||
uint64_t etcd_watch_revision_osd = 0;
|
uint64_t etcd_watch_revision_osd = 0;
|
||||||
uint64_t etcd_watch_revision_pg = 0;
|
uint64_t etcd_watch_revision_pg = 0;
|
||||||
|
timespec etcd_last_reload = {};
|
||||||
|
int load_pgs_timer_id = -1;
|
||||||
std::map<pool_id_t, pool_config_t> pool_config;
|
std::map<pool_id_t, pool_config_t> pool_config;
|
||||||
std::map<osd_num_t, json11::Json> peer_states;
|
std::map<osd_num_t, json11::Json> peer_states;
|
||||||
std::set<osd_num_t> seen_peers;
|
std::set<osd_num_t> seen_peers;
|
||||||
|
|
|
@ -377,7 +377,7 @@ static void io_callback(void *opaque, long retval)
|
||||||
bsd->completed.push_back(io);
|
bsd->completed.push_back(io);
|
||||||
if (bsd->trace)
|
if (bsd->trace)
|
||||||
{
|
{
|
||||||
printf("--- %s 0x%jx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
|
printf("--- %s 0x%jx retval=%jd\n", io->ddir == DDIR_READ ? "READ" :
|
||||||
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
|
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -117,29 +117,53 @@ void msgr_iothread_t::run()
|
||||||
|
|
||||||
void osd_messenger_t::init()
|
void osd_messenger_t::init()
|
||||||
{
|
{
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
if (use_rdmacm)
|
||||||
|
{
|
||||||
|
// RDMA-CM only requires the event channel. All the remaining work is done separately
|
||||||
|
rdmacm_evch = rdma_create_event_channel();
|
||||||
|
if (!rdmacm_evch)
|
||||||
|
{
|
||||||
|
// ENODEV means that the client doesn't have RDMA devices available
|
||||||
|
if (errno != ENODEV || log_level > 0)
|
||||||
|
fprintf(stderr, "Failed to initialize RDMA-CM event channel: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fcntl(rdmacm_evch->fd, F_SETFL, fcntl(rdmacm_evch->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
|
tfd->set_fd_handler(rdmacm_evch->fd, false, [this](int rdmacm_eventfd, int epoll_events)
|
||||||
|
{
|
||||||
|
handle_rdmacm_events();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (use_rdma)
|
if (use_rdma)
|
||||||
{
|
{
|
||||||
rdma_context = msgr_rdma_context_t::create(
|
rdma_contexts = msgr_rdma_context_t::create_all(
|
||||||
osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
|
osd_num && osd_cluster_network_masks.size() ? osd_cluster_network_masks : osd_network_masks,
|
||||||
|
rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||||
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
|
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
|
||||||
);
|
);
|
||||||
if (!rdma_context)
|
if (!rdma_contexts.size())
|
||||||
{
|
{
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
|
|
||||||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
|
||||||
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
||||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
for (msgr_rdma_context_t* rdma_context: rdma_contexts)
|
||||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
|
||||||
{
|
{
|
||||||
handle_rdma_events();
|
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
});
|
tfd->set_fd_handler(rdma_context->channel->fd, false, [this, rdma_context](int notify_fd, int epoll_events)
|
||||||
handle_rdma_events();
|
{
|
||||||
|
handle_rdma_events(rdma_context);
|
||||||
|
});
|
||||||
|
handle_rdma_events(rdma_context);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -247,10 +271,19 @@ osd_messenger_t::~osd_messenger_t()
|
||||||
iothreads.clear();
|
iothreads.clear();
|
||||||
}
|
}
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (rdma_context)
|
for (auto rdma_context: rdma_contexts)
|
||||||
{
|
{
|
||||||
delete rdma_context;
|
delete rdma_context;
|
||||||
}
|
}
|
||||||
|
rdma_contexts.clear();
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
if (rdmacm_evch)
|
||||||
|
{
|
||||||
|
tfd->set_fd_handler(rdmacm_evch->fd, false, NULL);
|
||||||
|
rdma_destroy_event_channel(rdmacm_evch);
|
||||||
|
rdmacm_evch = NULL;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -262,10 +295,14 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
// RDMA is on by default in RDMA-enabled builds
|
// RDMA is on by default in RDMA-enabled builds
|
||||||
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
|
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
|
||||||
}
|
}
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
// Use RDMA CM? (required for iWARP and may be useful for IB)
|
||||||
|
// FIXME: Only parse during start
|
||||||
|
this->use_rdmacm = config["use_rdmacm"].bool_value() || config["use_rdmacm"].uint64_value() != 0;
|
||||||
|
this->disable_tcp = this->use_rdmacm && (config["disable_tcp"].bool_value() || config["disable_tcp"].uint64_value() != 0);
|
||||||
|
#endif
|
||||||
this->rdma_device = config["rdma_device"].string_value();
|
this->rdma_device = config["rdma_device"].string_value();
|
||||||
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
|
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
|
||||||
if (!this->rdma_port_num)
|
|
||||||
this->rdma_port_num = 1;
|
|
||||||
if (!config["rdma_gid_index"].is_null())
|
if (!config["rdma_gid_index"].is_null())
|
||||||
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
||||||
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
||||||
|
@ -282,15 +319,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
this->rdma_max_msg = 129*1024;
|
this->rdma_max_msg = 129*1024;
|
||||||
this->rdma_odp = config["rdma_odp"].bool_value();
|
this->rdma_odp = config["rdma_odp"].bool_value();
|
||||||
std::vector<std::string> mask;
|
|
||||||
if (config["bind_address"].is_string())
|
|
||||||
mask.push_back(config["bind_address"].string_value());
|
|
||||||
else if (config["osd_network"].is_string())
|
|
||||||
mask.push_back(config["osd_network"].string_value());
|
|
||||||
else
|
|
||||||
for (auto v: config["osd_network"].array_items())
|
|
||||||
mask.push_back(v.string_value());
|
|
||||||
this->osd_networks = mask;
|
|
||||||
#endif
|
#endif
|
||||||
if (!osd_num)
|
if (!osd_num)
|
||||||
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
|
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
|
||||||
|
@ -314,23 +342,87 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
if (!this->osd_ping_timeout)
|
if (!this->osd_ping_timeout)
|
||||||
this->osd_ping_timeout = 5;
|
this->osd_ping_timeout = 5;
|
||||||
this->log_level = config["log_level"].uint64_value();
|
this->log_level = config["log_level"].uint64_value();
|
||||||
|
// OSD public & cluster networks
|
||||||
|
this->osd_networks.clear();
|
||||||
|
if (config["osd_network"].is_string())
|
||||||
|
this->osd_networks.push_back(config["osd_network"].string_value());
|
||||||
|
else
|
||||||
|
for (auto v: config["osd_network"].array_items())
|
||||||
|
this->osd_networks.push_back(v.string_value());
|
||||||
|
this->osd_cluster_networks.clear();
|
||||||
|
if (config["osd_cluster_network"].is_string())
|
||||||
|
this->osd_cluster_networks.push_back(config["osd_cluster_network"].string_value());
|
||||||
|
else
|
||||||
|
for (auto v: config["osd_cluster_network"].array_items())
|
||||||
|
this->osd_cluster_networks.push_back(v.string_value());
|
||||||
|
if (this->osd_cluster_networks.size())
|
||||||
|
for (auto & net: this->osd_cluster_networks)
|
||||||
|
for (int i = this->osd_networks.size()-1; i >= 0; i--)
|
||||||
|
if (this->osd_networks[i] == net)
|
||||||
|
this->osd_networks.erase(this->osd_networks.begin()+i, this->osd_networks.begin()+i+1);
|
||||||
|
this->osd_network_masks.clear();
|
||||||
|
for (auto & netstr: this->osd_networks)
|
||||||
|
this->osd_network_masks.push_back(cidr_parse(netstr));
|
||||||
|
this->osd_cluster_network_masks.clear();
|
||||||
|
for (auto & netstr: this->osd_cluster_networks)
|
||||||
|
this->osd_cluster_network_masks.push_back(cidr_parse(netstr));
|
||||||
|
this->all_osd_networks.clear();
|
||||||
|
this->all_osd_networks.insert(this->all_osd_networks.end(), this->osd_networks.begin(), this->osd_networks.end());
|
||||||
|
this->all_osd_networks.insert(this->all_osd_networks.end(), this->osd_cluster_networks.begin(), this->osd_cluster_networks.end());
|
||||||
|
this->all_osd_network_masks.clear();
|
||||||
|
this->all_osd_network_masks.insert(this->all_osd_network_masks.end(), this->osd_network_masks.begin(), this->osd_network_masks.end());
|
||||||
|
this->all_osd_network_masks.insert(this->all_osd_network_masks.end(), this->osd_cluster_network_masks.begin(), this->osd_cluster_network_masks.end());
|
||||||
|
if (!this->osd_networks.size())
|
||||||
|
{
|
||||||
|
this->osd_networks = this->osd_cluster_networks;
|
||||||
|
this->osd_network_masks = this->osd_cluster_network_masks;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
||||||
{
|
{
|
||||||
if (wanted_peers.find(peer_osd) == wanted_peers.end())
|
if (wanted_peers[peer_osd].raw_address_list != peer_state["addresses"])
|
||||||
{
|
{
|
||||||
wanted_peers[peer_osd] = (osd_wanted_peer_t){
|
wanted_peers[peer_osd].raw_address_list = peer_state["addresses"];
|
||||||
.address_list = peer_state["addresses"],
|
// We are an OSD -> try to select a cluster address
|
||||||
.port = (int)peer_state["port"].int64_value(),
|
// We are a client -> try to select a public address
|
||||||
};
|
// OSD only has 1 address -> don't try anything, it's pointless
|
||||||
|
// FIXME: Maybe support optional fallback from cluster to public network?
|
||||||
|
auto & match_masks = (this->osd_num ? osd_cluster_network_masks : osd_network_masks);
|
||||||
|
if (peer_state["addresses"].array_items().size() > 1 && match_masks.size())
|
||||||
|
{
|
||||||
|
json11::Json::array address_list;
|
||||||
|
for (auto json_addr: peer_state["addresses"].array_items())
|
||||||
|
{
|
||||||
|
struct sockaddr_storage addr;
|
||||||
|
auto ok = string_to_addr(json_addr.string_value(), false, 0, &addr);
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
bool matches = false;
|
||||||
|
for (auto & mask: match_masks)
|
||||||
|
{
|
||||||
|
if (cidr_sockaddr_match(addr, mask))
|
||||||
|
{
|
||||||
|
matches = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (matches)
|
||||||
|
address_list.push_back(json_addr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!address_list.size())
|
||||||
|
address_list = peer_state["addresses"].array_items();
|
||||||
|
wanted_peers[peer_osd].address_list = address_list;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
wanted_peers[peer_osd].address_list = peer_state["addresses"];
|
||||||
|
wanted_peers[peer_osd].address_changed = true;
|
||||||
}
|
}
|
||||||
else
|
#ifdef WITH_RDMACM
|
||||||
{
|
wanted_peers[peer_osd].rdmacm_port = (int)peer_state["rdmacm_port"].int64_value();
|
||||||
wanted_peers[peer_osd].address_list = peer_state["addresses"];
|
#endif
|
||||||
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
|
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
|
||||||
}
|
|
||||||
wanted_peers[peer_osd].address_changed = true;
|
|
||||||
try_connect_peer(peer_osd);
|
try_connect_peer(peer_osd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -355,12 +447,24 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
|
||||||
wp.cur_addr = wp.address_list[wp.address_index].string_value();
|
wp.cur_addr = wp.address_list[wp.address_index].string_value();
|
||||||
wp.cur_port = wp.port;
|
wp.cur_port = wp.port;
|
||||||
wp.connecting = true;
|
wp.connecting = true;
|
||||||
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
|
#ifdef WITH_RDMACM
|
||||||
|
if (use_rdmacm && wp.rdmacm_port)
|
||||||
|
rdmacm_try_connect_peer(peer_osd, wp.cur_addr.c_str(), wp.rdmacm_port, wp.cur_port);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
try_connect_peer_tcp(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
|
void osd_messenger_t::try_connect_peer_tcp(osd_num_t peer_osd, const char *peer_host, int peer_port)
|
||||||
{
|
{
|
||||||
assert(peer_osd != this->osd_num);
|
assert(peer_osd != this->osd_num);
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
if (disable_tcp)
|
||||||
|
{
|
||||||
|
on_connect_peer(peer_osd, -EINVAL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
struct sockaddr_storage addr;
|
struct sockaddr_storage addr;
|
||||||
if (!string_to_addr(peer_host, 0, peer_port, &addr))
|
if (!string_to_addr(peer_host, 0, peer_port, &addr))
|
||||||
{
|
{
|
||||||
|
@ -524,20 +628,30 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (rdma_context)
|
if (!use_rdmacm && rdma_contexts.size())
|
||||||
{
|
{
|
||||||
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
|
// Choose the right context for the selected network
|
||||||
if (cl->rdma_conn)
|
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
|
||||||
|
if (!selected_ctx)
|
||||||
{
|
{
|
||||||
json11::Json payload = json11::Json::object {
|
if (log_level > 0)
|
||||||
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
|
fprintf(stderr, "No RDMA context for OSD %ju connection (peer %d), using only TCP\n", cl->osd_num, cl->peer_fd);
|
||||||
{ "rdma_max_msg", cl->rdma_conn->max_msg },
|
}
|
||||||
};
|
else
|
||||||
std::string payload_str = payload.dump();
|
{
|
||||||
op->req.show_conf.json_len = payload_str.size();
|
cl->rdma_conn = msgr_rdma_connection_t::create(selected_ctx, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
|
||||||
op->buf = malloc_or_die(payload_str.size());
|
if (cl->rdma_conn)
|
||||||
op->iov.push_back(op->buf, payload_str.size());
|
{
|
||||||
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
json11::Json payload = json11::Json::object {
|
||||||
|
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
|
||||||
|
{ "rdma_max_msg", cl->rdma_conn->max_msg },
|
||||||
|
};
|
||||||
|
std::string payload_str = payload.dump();
|
||||||
|
op->req.show_conf.json_len = payload_str.size();
|
||||||
|
op->buf = malloc_or_die(payload_str.size());
|
||||||
|
op->iov.push_back(op->buf, payload_str.size());
|
||||||
|
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -582,29 +696,23 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
osd_num_t peer_osd = cl->osd_num;
|
osd_num_t peer_osd = cl->osd_num;
|
||||||
stop_client(op->peer_fd);
|
stop_client(op->peer_fd);
|
||||||
on_connect_peer(peer_osd, -1);
|
on_connect_peer(peer_osd, -EINVAL);
|
||||||
delete op;
|
delete op;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (config["rdma_address"].is_string())
|
if (!use_rdmacm && cl->rdma_conn && config["rdma_address"].is_string())
|
||||||
{
|
{
|
||||||
msgr_rdma_address_t addr;
|
msgr_rdma_address_t addr;
|
||||||
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
||||||
cl->rdma_conn->connect(&addr) != 0)
|
cl->rdma_conn->connect(&addr) != 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to connect to OSD %ju (address %s) using RDMA\n",
|
stderr, "Failed to connect to OSD %ju (address %s) using RDMA, switching back to TCP\n",
|
||||||
cl->osd_num, config["rdma_address"].string_value().c_str()
|
cl->osd_num, config["rdma_address"].string_value().c_str()
|
||||||
);
|
);
|
||||||
delete cl->rdma_conn;
|
delete cl->rdma_conn;
|
||||||
cl->rdma_conn = NULL;
|
cl->rdma_conn = NULL;
|
||||||
// FIXME: Keep TCP connection in this case
|
|
||||||
osd_num_t peer_osd = cl->osd_num;
|
|
||||||
stop_client(cl->peer_fd);
|
|
||||||
on_connect_peer(peer_osd, -1);
|
|
||||||
delete op;
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -673,9 +781,29 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
|
msgr_rdma_context_t* osd_messenger_t::choose_rdma_context(osd_client_t *cl)
|
||||||
|
{
|
||||||
|
// Choose the right context for the selected network
|
||||||
|
msgr_rdma_context_t *selected_ctx = NULL;
|
||||||
|
for (auto rdma_ctx: rdma_contexts)
|
||||||
|
{
|
||||||
|
if (!rdma_ctx->net_mask.family && !selected_ctx ||
|
||||||
|
rdma_ctx->net_mask.family && cidr_sockaddr_match(cl->peer_addr, rdma_ctx->net_mask))
|
||||||
|
{
|
||||||
|
selected_ctx = rdma_ctx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return selected_ctx;
|
||||||
|
}
|
||||||
|
|
||||||
bool osd_messenger_t::is_rdma_enabled()
|
bool osd_messenger_t::is_rdma_enabled()
|
||||||
{
|
{
|
||||||
return rdma_context != NULL;
|
return rdma_contexts.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::is_use_rdmacm()
|
||||||
|
{
|
||||||
|
return use_rdmacm;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
#include "json11/json11.hpp"
|
#include "json11/json11.hpp"
|
||||||
#include "msgr_op.h"
|
#include "msgr_op.h"
|
||||||
#include "timerfd_manager.h"
|
#include "timerfd_manager.h"
|
||||||
|
#include "addr_util.h"
|
||||||
#include <ringloop.h>
|
#include <ringloop.h>
|
||||||
|
|
||||||
#define CL_READ_HDR 1
|
#define CL_READ_HDR 1
|
||||||
|
@ -49,10 +50,10 @@ struct osd_client_t
|
||||||
{
|
{
|
||||||
int refs = 0;
|
int refs = 0;
|
||||||
|
|
||||||
sockaddr_storage peer_addr;
|
sockaddr_storage peer_addr = {};
|
||||||
int peer_port;
|
int peer_port = 0;
|
||||||
int peer_fd = -1;
|
int peer_fd = -1;
|
||||||
int peer_state;
|
int peer_state = 0;
|
||||||
int connect_timeout_id = -1;
|
int connect_timeout_id = -1;
|
||||||
int ping_time_remaining = 0;
|
int ping_time_remaining = 0;
|
||||||
int idle_time_remaining = 0;
|
int idle_time_remaining = 0;
|
||||||
|
@ -93,13 +94,17 @@ struct osd_client_t
|
||||||
|
|
||||||
struct osd_wanted_peer_t
|
struct osd_wanted_peer_t
|
||||||
{
|
{
|
||||||
|
json11::Json raw_address_list;
|
||||||
json11::Json address_list;
|
json11::Json address_list;
|
||||||
int port;
|
int port = 0;
|
||||||
time_t last_connect_attempt;
|
#ifdef WITH_RDMACM
|
||||||
bool connecting, address_changed;
|
int rdmacm_port = 0;
|
||||||
int address_index;
|
#endif
|
||||||
|
time_t last_connect_attempt = 0;
|
||||||
|
bool connecting = false, address_changed = false;
|
||||||
|
int address_index = 0;
|
||||||
std::string cur_addr;
|
std::string cur_addr;
|
||||||
int cur_port;
|
int cur_port = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct osd_op_stats_t
|
struct osd_op_stats_t
|
||||||
|
@ -149,6 +154,15 @@ public:
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
struct rdma_event_channel;
|
||||||
|
struct rdma_cm_id;
|
||||||
|
struct rdma_cm_event;
|
||||||
|
struct ibv_context;
|
||||||
|
struct osd_messenger_t;
|
||||||
|
struct rdmacm_connecting_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
struct osd_messenger_t
|
struct osd_messenger_t
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
|
@ -165,14 +179,19 @@ protected:
|
||||||
|
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
bool use_rdma = true;
|
bool use_rdma = true;
|
||||||
std::vector<std::string> osd_networks;
|
bool use_rdmacm = false;
|
||||||
|
bool disable_tcp = false;
|
||||||
std::string rdma_device;
|
std::string rdma_device;
|
||||||
uint64_t rdma_port_num = 1, rdma_mtu = 0;
|
uint64_t rdma_port_num = 1;
|
||||||
|
int rdma_mtu = 0;
|
||||||
int rdma_gid_index = -1;
|
int rdma_gid_index = -1;
|
||||||
msgr_rdma_context_t *rdma_context = NULL;
|
std::vector<msgr_rdma_context_t *> rdma_contexts;
|
||||||
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
|
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
|
||||||
uint64_t rdma_max_msg = 0;
|
uint64_t rdma_max_msg = 0;
|
||||||
bool rdma_odp = false;
|
bool rdma_odp = false;
|
||||||
|
rdma_event_channel *rdmacm_evch = NULL;
|
||||||
|
std::map<rdma_cm_id*, osd_client_t*> rdmacm_connections;
|
||||||
|
std::map<rdma_cm_id*, rdmacm_connecting_t*> rdmacm_connecting;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::vector<msgr_iothread_t*> iothreads;
|
std::vector<msgr_iothread_t*> iothreads;
|
||||||
|
@ -190,6 +209,12 @@ public:
|
||||||
std::map<int, osd_client_t*> clients;
|
std::map<int, osd_client_t*> clients;
|
||||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||||
std::map<uint64_t, int> osd_peer_fds;
|
std::map<uint64_t, int> osd_peer_fds;
|
||||||
|
std::vector<std::string> osd_networks;
|
||||||
|
std::vector<addr_mask_t> osd_network_masks;
|
||||||
|
std::vector<std::string> osd_cluster_networks;
|
||||||
|
std::vector<addr_mask_t> osd_cluster_network_masks;
|
||||||
|
std::vector<std::string> all_osd_networks;
|
||||||
|
std::vector<addr_mask_t> all_osd_network_masks;
|
||||||
// op statistics
|
// op statistics
|
||||||
osd_op_stats_t stats, recovery_stats;
|
osd_op_stats_t stats, recovery_stats;
|
||||||
|
|
||||||
|
@ -216,13 +241,18 @@ public:
|
||||||
bool is_rdma_enabled();
|
bool is_rdma_enabled();
|
||||||
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
|
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
bool is_use_rdmacm();
|
||||||
|
rdma_cm_id *rdmacm_listen(const std::string & bind_address, int rdmacm_port, int *bound_port, int log_level);
|
||||||
|
void rdmacm_destroy_listener(rdma_cm_id *listener);
|
||||||
|
#endif
|
||||||
|
|
||||||
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
|
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
|
||||||
void measure_exec(osd_op_t *cur_op);
|
void measure_exec(osd_op_t *cur_op);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void try_connect_peer(uint64_t osd_num);
|
void try_connect_peer(uint64_t osd_num);
|
||||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
void try_connect_peer_tcp(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||||
void handle_peer_epoll(int peer_fd, int epoll_events);
|
void handle_peer_epoll(int peer_fd, int epoll_events);
|
||||||
void handle_connect_epoll(int peer_fd);
|
void handle_connect_epoll(int peer_fd);
|
||||||
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
|
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
|
||||||
|
@ -247,6 +277,18 @@ protected:
|
||||||
void try_send_rdma_odp(osd_client_t *cl);
|
void try_send_rdma_odp(osd_client_t *cl);
|
||||||
void try_send_rdma_nodp(osd_client_t *cl);
|
void try_send_rdma_nodp(osd_client_t *cl);
|
||||||
bool try_recv_rdma(osd_client_t *cl);
|
bool try_recv_rdma(osd_client_t *cl);
|
||||||
void handle_rdma_events();
|
void handle_rdma_events(msgr_rdma_context_t *rdma_context);
|
||||||
|
msgr_rdma_context_t* choose_rdma_context(osd_client_t *cl);
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
void handle_rdmacm_events();
|
||||||
|
msgr_rdma_context_t* rdmacm_get_context(ibv_context *verbs);
|
||||||
|
msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
|
||||||
|
void rdmacm_accept(rdma_cm_event *ev);
|
||||||
|
void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
|
||||||
|
void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
|
||||||
|
void rdmacm_address_resolved(rdma_cm_event *ev);
|
||||||
|
void rdmacm_route_resolved(rdma_cm_event *ev);
|
||||||
|
void rdmacm_established(rdma_cm_event *ev);
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
|
@ -3,10 +3,35 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include "addr_util.h"
|
|
||||||
#include "msgr_rdma.h"
|
#include "msgr_rdma.h"
|
||||||
#include "messenger.h"
|
#include "messenger.h"
|
||||||
|
|
||||||
|
static uint32_t ibv_mtu_to_bytes(ibv_mtu mtu)
|
||||||
|
{
|
||||||
|
switch (mtu)
|
||||||
|
{
|
||||||
|
case IBV_MTU_256: return 256;
|
||||||
|
case IBV_MTU_512: return 512;
|
||||||
|
case IBV_MTU_1024: return 1024;
|
||||||
|
case IBV_MTU_2048: return 2048;
|
||||||
|
case IBV_MTU_4096: return 4096;
|
||||||
|
}
|
||||||
|
return 4096;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ibv_mtu bytes_to_ibv_mtu(uint32_t mtu)
|
||||||
|
{
|
||||||
|
switch (mtu)
|
||||||
|
{
|
||||||
|
case 256: return IBV_MTU_256;
|
||||||
|
case 512: return IBV_MTU_512;
|
||||||
|
case 1024: return IBV_MTU_1024;
|
||||||
|
case 2048: return IBV_MTU_2048;
|
||||||
|
case 4096: return IBV_MTU_4096;
|
||||||
|
}
|
||||||
|
return IBV_MTU_4096;
|
||||||
|
}
|
||||||
|
|
||||||
std::string msgr_rdma_address_t::to_string()
|
std::string msgr_rdma_address_t::to_string()
|
||||||
{
|
{
|
||||||
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
||||||
|
@ -38,15 +63,22 @@ msgr_rdma_context_t::~msgr_rdma_context_t()
|
||||||
ibv_dereg_mr(mr);
|
ibv_dereg_mr(mr);
|
||||||
if (pd)
|
if (pd)
|
||||||
ibv_dealloc_pd(pd);
|
ibv_dealloc_pd(pd);
|
||||||
if (context)
|
if (context && !is_cm)
|
||||||
ibv_close_device(context);
|
ibv_close_device(context);
|
||||||
}
|
}
|
||||||
|
|
||||||
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||||
{
|
{
|
||||||
ctx->used_max_cqe -= max_send+max_recv;
|
ctx->reserve_cqe(-max_send-max_recv);
|
||||||
if (qp)
|
if (qp && !cmid)
|
||||||
ibv_destroy_qp(qp);
|
ibv_destroy_qp(qp);
|
||||||
|
if (cmid)
|
||||||
|
{
|
||||||
|
ctx->cm_refs--;
|
||||||
|
if (cmid->qp)
|
||||||
|
rdma_destroy_qp(cmid);
|
||||||
|
rdma_destroy_id(cmid);
|
||||||
|
}
|
||||||
if (recv_buffers.size())
|
if (recv_buffers.size())
|
||||||
{
|
{
|
||||||
for (auto b: recv_buffers)
|
for (auto b: recv_buffers)
|
||||||
|
@ -77,21 +109,21 @@ static bool is_ipv4_gid(ibv_gid_entry *gidx)
|
||||||
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
|
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
|
static int match_gid(ibv_gid_entry *gidx, const addr_mask_t *networks, int nnet)
|
||||||
{
|
{
|
||||||
if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
|
if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
|
||||||
gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
|
gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
|
||||||
((uint64_t*)gidx->gid.raw)[0] == 0 &&
|
((uint64_t*)gidx->gid.raw)[0] == 0 &&
|
||||||
((uint64_t*)gidx->gid.raw)[1] == 0)
|
((uint64_t*)gidx->gid.raw)[1] == 0)
|
||||||
{
|
{
|
||||||
return false;
|
return -1;
|
||||||
}
|
}
|
||||||
if (is_ipv4_gid(gidx))
|
if (is_ipv4_gid(gidx))
|
||||||
{
|
{
|
||||||
for (int i = 0; i < nnet; i++)
|
for (int i = 0; i < nnet; i++)
|
||||||
{
|
{
|
||||||
if (networks[i].family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), networks[i].ipv4, networks[i].bits))
|
if (networks[i].family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), networks[i].ipv4, networks[i].bits))
|
||||||
return true;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -99,119 +131,67 @@ static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
|
||||||
for (int i = 0; i < nnet; i++)
|
for (int i = 0; i < nnet; i++)
|
||||||
{
|
{
|
||||||
if (networks[i].family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, networks[i].ipv6, networks[i].bits))
|
if (networks[i].family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, networks[i].ipv6, networks[i].bits))
|
||||||
return true;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct matched_dev
|
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, int mtu, ibv_gid_entry & gidx)
|
||||||
{
|
|
||||||
int dev = -1;
|
|
||||||
int port = -1;
|
|
||||||
int gid = -1;
|
|
||||||
bool rocev2 = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
|
|
||||||
{
|
{
|
||||||
bool is4 = ((uint64_t*)gidx.gid.raw)[0] == 0 && ((uint32_t*)gidx.gid.raw)[2] == 0xffff0000;
|
bool is4 = ((uint64_t*)gidx.gid.raw)[0] == 0 && ((uint32_t*)gidx.gid.raw)[2] == 0xffff0000;
|
||||||
char buf[256];
|
char buf[256];
|
||||||
inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf));
|
inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf));
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
|
stderr, "Selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s, MTU %d\n",
|
||||||
ibv_get_device_name(dev), ib_port, gid_index,
|
ibv_get_device_name(dev), ib_port, gid_index,
|
||||||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf
|
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf, mtu
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
|
static int match_port_gid(const std::vector<addr_mask_t> & osd_network_masks, ibv_context *context,
|
||||||
|
int port_num, int gid_count, int log_level, ibv_gid_entry *best_gidx, int *net_num)
|
||||||
{
|
{
|
||||||
matched_dev best;
|
// Try to find a port with matching address
|
||||||
ibv_device_attr attr;
|
int best_gid_idx = -1, res = 0;
|
||||||
ibv_port_attr portinfo;
|
for (int k = 0; k < gid_count; k++)
|
||||||
ibv_gid_entry best_gidx;
|
|
||||||
int res;
|
|
||||||
bool have_non_roce = false, have_roce = false;
|
|
||||||
for (int i = 0; dev_list[i]; ++i)
|
|
||||||
{
|
{
|
||||||
auto dev = dev_list[i];
|
ibv_gid_entry gidx;
|
||||||
ibv_context *context = ibv_open_device(dev_list[i]);
|
if ((res = ibv_query_gid_ex(context, port_num, k, &gidx, 0)) != 0)
|
||||||
if ((res = ibv_query_device(context, &attr)) != 0)
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev_list[i]), strerror(res));
|
if (res != ENODATA)
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
for (int j = 1; j <= attr.phys_port_cnt; j++)
|
|
||||||
{
|
|
||||||
// Try to find a port with matching address
|
|
||||||
if ((res = ibv_query_port(context, j, &portinfo)) != 0)
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
|
fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(context->device), k, strerror(res));
|
||||||
goto cleanup;
|
continue;
|
||||||
}
|
}
|
||||||
for (int k = 0; k < portinfo.gid_tbl_len; k++)
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if ((res = match_gid(&gidx, osd_network_masks.data(), osd_network_masks.size())) >= 0)
|
||||||
|
{
|
||||||
|
// Prefer RoCEv2
|
||||||
|
if (best_gid_idx < 0 || best_gidx->gid_type != IBV_GID_TYPE_ROCE_V2 && gidx.gid_type == IBV_GID_TYPE_ROCE_V2)
|
||||||
{
|
{
|
||||||
ibv_gid_entry gidx;
|
best_gid_idx = k;
|
||||||
if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
|
*best_gidx = gidx;
|
||||||
{
|
*net_num = res;
|
||||||
if (res != ENODATA)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (gidx.gid_type != IBV_GID_TYPE_ROCE_V1 &&
|
|
||||||
gidx.gid_type != IBV_GID_TYPE_ROCE_V2)
|
|
||||||
have_non_roce = true;
|
|
||||||
else
|
|
||||||
have_roce = true;
|
|
||||||
if (match_gid(&gidx, networks, nnet))
|
|
||||||
{
|
|
||||||
// Prefer RoCEv2
|
|
||||||
if (!best.rocev2)
|
|
||||||
{
|
|
||||||
best.dev = i;
|
|
||||||
best.port = j;
|
|
||||||
best.gid = k;
|
|
||||||
best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
|
|
||||||
best_gidx = gidx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cleanup:
|
|
||||||
ibv_close_device(context);
|
|
||||||
if (best.rocev2)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (best.dev >= 0 && log_level > 0)
|
return best_gid_idx;
|
||||||
{
|
|
||||||
log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
|
|
||||||
}
|
|
||||||
if (best.dev < 0 && have_non_roce && !have_roce)
|
|
||||||
{
|
|
||||||
best.dev = -2;
|
|
||||||
}
|
|
||||||
return best;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level)
|
std::vector<msgr_rdma_context_t*> msgr_rdma_context_t::create_all(const std::vector<addr_mask_t> & osd_network_masks,
|
||||||
|
const char *sel_dev_name, int sel_port_num, int sel_gid_index, uint32_t sel_mtu, bool odp, int log_level)
|
||||||
{
|
{
|
||||||
int res;
|
int res;
|
||||||
|
std::vector<msgr_rdma_context_t*> ret;
|
||||||
|
ibv_device **raw_dev_list = NULL;
|
||||||
ibv_device **dev_list = NULL;
|
ibv_device **dev_list = NULL;
|
||||||
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
|
ibv_device *single_list[2] = {};
|
||||||
ctx->mtu = mtu;
|
|
||||||
|
|
||||||
timespec tv;
|
raw_dev_list = dev_list = ibv_get_device_list(NULL);
|
||||||
clock_gettime(CLOCK_REALTIME, &tv);
|
|
||||||
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
|
|
||||||
dev_list = ibv_get_device_list(NULL);
|
|
||||||
if (!dev_list || !*dev_list)
|
if (!dev_list || !*dev_list)
|
||||||
{
|
{
|
||||||
if (errno == -ENOSYS || errno == ENOSYS)
|
if (errno == -ENOSYS || errno == ENOSYS)
|
||||||
|
@ -228,121 +208,131 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
|
||||||
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
|
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
if (ib_devname)
|
|
||||||
|
if (sel_dev_name)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; dev_list[i]; ++i)
|
for (i = 0; dev_list[i]; ++i)
|
||||||
if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname))
|
if (!strcmp(ibv_get_device_name(dev_list[i]), sel_dev_name))
|
||||||
break;
|
break;
|
||||||
ctx->dev = dev_list[i];
|
if (!dev_list[i])
|
||||||
if (!ctx->dev)
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "RDMA device %s not found\n", ib_devname);
|
fprintf(stderr, "RDMA device %s not found\n", sel_dev_name);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
single_list[0] = dev_list[i];
|
||||||
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
|
dev_list = single_list;
|
||||||
else if (osd_networks.size())
|
|
||||||
{
|
|
||||||
std::vector<addr_mask_t> nets;
|
|
||||||
for (auto & netstr: osd_networks)
|
|
||||||
{
|
|
||||||
nets.push_back(cidr_parse(netstr));
|
|
||||||
}
|
|
||||||
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
|
|
||||||
if (best.dev == -2)
|
|
||||||
{
|
|
||||||
best.dev = 0;
|
|
||||||
if (log_level > 0)
|
|
||||||
fprintf(stderr, "No RoCE devices found, using first available RDMA device %s\n", ibv_get_device_name(*dev_list));
|
|
||||||
}
|
|
||||||
else if (best.dev < 0)
|
|
||||||
{
|
|
||||||
if (log_level > 0)
|
|
||||||
fprintf(stderr, "RDMA device matching osd_network is not found, disabling RDMA\n");
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ib_port = best.port;
|
|
||||||
gid_index = best.gid;
|
|
||||||
}
|
|
||||||
ctx->dev = dev_list[best.dev];
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ctx->dev = *dev_list;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->context = ibv_open_device(ctx->dev);
|
for (int i = 0; dev_list[i]; ++i)
|
||||||
if (!ctx->context)
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(ctx->dev));
|
auto dev = dev_list[i];
|
||||||
goto cleanup;
|
ibv_context *context = ibv_open_device(dev);
|
||||||
}
|
if (!context)
|
||||||
|
|
||||||
ctx->ib_port = ib_port;
|
|
||||||
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
ctx->my_lid = ctx->portinfo.lid;
|
|
||||||
if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx->my_lid)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
|
|
||||||
if (gid_index != -1)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
ctx->gid_index = gid_index < 0 ? 0 : gid_index;
|
|
||||||
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
|
fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(dev));
|
||||||
goto cleanup;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
ibv_device_attr attr;
|
||||||
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
|
if ((res = ibv_query_device(context, &attr)) != 0)
|
||||||
else
|
|
||||||
{
|
|
||||||
// Auto-guess GID
|
|
||||||
ibv_gid_entry best_gidx;
|
|
||||||
for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
|
|
||||||
{
|
{
|
||||||
ibv_gid_entry gidx;
|
fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev), strerror(res));
|
||||||
if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
|
goto cleanup_dev;
|
||||||
|
}
|
||||||
|
if (sel_port_num && sel_port_num > attr.phys_port_cnt)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA device %s port %d does not exist\n", ibv_get_device_name(dev), sel_port_num);
|
||||||
|
goto cleanup_dev;
|
||||||
|
}
|
||||||
|
for (int port_num = (sel_port_num ? sel_port_num : 1); port_num <= (sel_port_num ? sel_port_num : attr.phys_port_cnt); port_num++)
|
||||||
|
{
|
||||||
|
ibv_port_attr portinfo;
|
||||||
|
if ((res = ibv_query_port(context, port_num, &portinfo)) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
|
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), port_num, strerror(res));
|
||||||
goto cleanup;
|
continue;
|
||||||
}
|
}
|
||||||
// Skip empty GID
|
if (portinfo.state != IBV_PORT_ACTIVE)
|
||||||
if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
|
|
||||||
((uint64_t*)gidx.gid.raw)[1] == 0)
|
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Prefer IPv4 RoCEv2 -> IPv6 RoCEv2 -> IPv4 RoCEv1 -> IPv6 RoCEv1 -> IB
|
if (sel_gid_index >= (int)portinfo.gid_tbl_len)
|
||||||
if (gid_index == -1 ||
|
|
||||||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 && best_gidx.gid_type != IBV_GID_TYPE_ROCE_V2 ||
|
|
||||||
gidx.gid_type == IBV_GID_TYPE_ROCE_V1 && best_gidx.gid_type == IBV_GID_TYPE_IB ||
|
|
||||||
gidx.gid_type == best_gidx.gid_type && is_ipv4_gid(&gidx))
|
|
||||||
{
|
{
|
||||||
gid_index = k;
|
fprintf(stderr, "RDMA device %s port %d GID %d does not exist\n", ibv_get_device_name(dev), port_num, sel_gid_index);
|
||||||
best_gidx = gidx;
|
continue;
|
||||||
|
}
|
||||||
|
uint32_t port_mtu = sel_mtu ? sel_mtu : ibv_mtu_to_bytes(portinfo.active_mtu);
|
||||||
|
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
|
||||||
|
if (sel_gid_index < 0)
|
||||||
|
{
|
||||||
|
ibv_gid_entry best_gidx;
|
||||||
|
int net_num = 0;
|
||||||
|
int best_gid_idx = match_port_gid(osd_network_masks, context, port_num, portinfo.gid_tbl_len, log_level, &best_gidx, &net_num);
|
||||||
|
if (best_gid_idx >= 0)
|
||||||
|
{
|
||||||
|
if (log_level > 0)
|
||||||
|
log_rdma_dev_port_gid(dev, port_num, best_gid_idx, port_mtu, best_gidx);
|
||||||
|
auto ctx = msgr_rdma_context_t::create(dev, portinfo, port_num, best_gid_idx, port_mtu, odp, log_level);
|
||||||
|
if (ctx)
|
||||||
|
{
|
||||||
|
ctx->net_mask = osd_network_masks[net_num];
|
||||||
|
ret.push_back(ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
int best_gid_idx = sel_gid_index >= 0 ? sel_gid_index : 0;
|
||||||
|
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
ibv_gid_entry gidx;
|
||||||
|
ibv_query_gid_ex(context, port_num, best_gid_idx, &gidx, 0);
|
||||||
|
log_rdma_dev_port_gid(dev, port_num, best_gid_idx, port_mtu, gidx);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
auto ctx = msgr_rdma_context_t::create(dev, portinfo, port_num, best_gid_idx, port_mtu, odp, log_level);
|
||||||
|
if (ctx)
|
||||||
|
ret.push_back(ctx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
|
cleanup_dev:
|
||||||
if (log_level > 0)
|
ibv_close_device(context);
|
||||||
{
|
}
|
||||||
log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, best_gidx);
|
|
||||||
}
|
cleanup:
|
||||||
ctx->my_gid = best_gidx.gid;
|
if (raw_dev_list)
|
||||||
|
ibv_free_device_list(raw_dev_list);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
msgr_rdma_context_t *msgr_rdma_context_t::create(ibv_device *dev, ibv_port_attr & portinfo, int ib_port, int gid_index, uint32_t mtu, bool odp, int log_level)
|
||||||
|
{
|
||||||
|
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
|
||||||
|
ibv_context *context = ibv_open_device(dev);
|
||||||
|
if (!context)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(dev));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->mtu = mtu;
|
||||||
|
ctx->context = context;
|
||||||
|
ctx->ib_port = ib_port;
|
||||||
|
ctx->portinfo = portinfo;
|
||||||
|
ctx->my_lid = ctx->portinfo.lid;
|
||||||
|
if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx->my_lid)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(dev));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
ctx->gid_index = gid_index;
|
||||||
|
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(dev), gid_index);
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
ctx->pd = ibv_alloc_pd(ctx->context);
|
ctx->pd = ibv_alloc_pd(ctx->context);
|
||||||
if (!ctx->pd)
|
if (!ctx->pd)
|
||||||
|
@ -351,18 +341,19 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
|
||||||
{
|
{
|
||||||
if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
|
fprintf(stderr, "Couldn't query RDMA device for its features\n");
|
||||||
{
|
goto cleanup;
|
||||||
fprintf(stderr, "Couldn't query RDMA device for its features\n");
|
}
|
||||||
goto cleanup;
|
|
||||||
}
|
ctx->odp = odp;
|
||||||
ctx->odp = odp;
|
if (ctx->odp)
|
||||||
if (ctx->odp &&
|
{
|
||||||
(!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
|
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
|
||||||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
|
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
|
||||||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
|
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
|
||||||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
|
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
|
||||||
{
|
{
|
||||||
ctx->odp = false;
|
ctx->odp = false;
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
|
@ -395,20 +386,43 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dev_list)
|
|
||||||
ibv_free_device_list(dev_list);
|
|
||||||
return ctx;
|
return ctx;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
if (context)
|
||||||
|
ibv_close_device(context);
|
||||||
delete ctx;
|
delete ctx;
|
||||||
if (dev_list)
|
|
||||||
ibv_free_device_list(dev_list);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool msgr_rdma_context_t::reserve_cqe(int n)
|
||||||
|
{
|
||||||
|
this->used_max_cqe += n;
|
||||||
|
if (this->used_max_cqe > this->max_cqe)
|
||||||
|
{
|
||||||
|
// Resize CQ
|
||||||
|
// Mellanox ConnectX-4 supports up to 4194303 CQEs, so it's fine to put everything into a single CQ
|
||||||
|
int new_max_cqe = this->max_cqe;
|
||||||
|
while (this->used_max_cqe > new_max_cqe)
|
||||||
|
{
|
||||||
|
new_max_cqe *= 2;
|
||||||
|
}
|
||||||
|
if (ibv_resize_cq(this->cq, new_max_cqe) != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", new_max_cqe);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
this->max_cqe = new_max_cqe;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
|
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
|
||||||
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
|
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
|
||||||
{
|
{
|
||||||
|
if (!ctx->reserve_cqe(max_send+max_recv))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
|
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
|
||||||
|
|
||||||
max_sge = max_sge > ctx->attrx.orig_attr.max_sge ? ctx->attrx.orig_attr.max_sge : max_sge;
|
max_sge = max_sge > ctx->attrx.orig_attr.max_sge ? ctx->attrx.orig_attr.max_sge : max_sge;
|
||||||
|
@ -419,25 +433,6 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
|
||||||
conn->max_sge = max_sge;
|
conn->max_sge = max_sge;
|
||||||
conn->max_msg = max_msg;
|
conn->max_msg = max_msg;
|
||||||
|
|
||||||
ctx->used_max_cqe += max_send+max_recv;
|
|
||||||
if (ctx->used_max_cqe > ctx->max_cqe)
|
|
||||||
{
|
|
||||||
// Resize CQ
|
|
||||||
// Mellanox ConnectX-4 supports up to 4194303 CQEs, so it's fine to put everything into a single CQ
|
|
||||||
int new_max_cqe = ctx->max_cqe;
|
|
||||||
while (ctx->used_max_cqe > new_max_cqe)
|
|
||||||
{
|
|
||||||
new_max_cqe *= 2;
|
|
||||||
}
|
|
||||||
if (ibv_resize_cq(ctx->cq, new_max_cqe) != 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", new_max_cqe);
|
|
||||||
delete conn;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
ctx->max_cqe = new_max_cqe;
|
|
||||||
}
|
|
||||||
|
|
||||||
ibv_qp_init_attr init_attr = {
|
ibv_qp_init_attr init_attr = {
|
||||||
.send_cq = ctx->cq,
|
.send_cq = ctx->cq,
|
||||||
.recv_cq = ctx->cq,
|
.recv_cq = ctx->cq,
|
||||||
|
@ -469,9 +464,10 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
|
||||||
.port_num = ctx->ib_port,
|
.port_num = ctx->ib_port,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS))
|
int r = 0;
|
||||||
|
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state\n");
|
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state: %s (code %d)\n", strerror(r), r);
|
||||||
delete conn;
|
delete conn;
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -479,25 +475,12 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
|
||||||
return conn;
|
return conn;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
|
|
||||||
{
|
|
||||||
switch (mtu)
|
|
||||||
{
|
|
||||||
case 256: return IBV_MTU_256;
|
|
||||||
case 512: return IBV_MTU_512;
|
|
||||||
case 1024: return IBV_MTU_1024;
|
|
||||||
case 2048: return IBV_MTU_2048;
|
|
||||||
case 4096: return IBV_MTU_4096;
|
|
||||||
}
|
|
||||||
return IBV_MTU_4096;
|
|
||||||
}
|
|
||||||
|
|
||||||
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
|
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
|
||||||
{
|
{
|
||||||
auto conn = this;
|
auto conn = this;
|
||||||
ibv_qp_attr attr = {
|
ibv_qp_attr attr = {
|
||||||
.qp_state = IBV_QPS_RTR,
|
.qp_state = IBV_QPS_RTR,
|
||||||
.path_mtu = mtu_to_ibv_mtu(conn->ctx->mtu),
|
.path_mtu = bytes_to_ibv_mtu(conn->ctx->mtu),
|
||||||
.rq_psn = dest->psn,
|
.rq_psn = dest->psn,
|
||||||
.sq_psn = conn->addr.psn,
|
.sq_psn = conn->addr.psn,
|
||||||
.dest_qp_num = dest->qpn,
|
.dest_qp_num = dest->qpn,
|
||||||
|
@ -522,18 +505,19 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
|
||||||
.rnr_retry = 7,
|
.rnr_retry = 7,
|
||||||
};
|
};
|
||||||
// FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
|
// FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
|
||||||
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
|
int r;
|
||||||
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER))
|
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
|
||||||
|
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state\n");
|
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state: %s (code %d)\n", strerror(r), r);
|
||||||
return 1;
|
return -r;
|
||||||
}
|
}
|
||||||
attr.qp_state = IBV_QPS_RTS;
|
attr.qp_state = IBV_QPS_RTS;
|
||||||
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
|
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
|
||||||
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC))
|
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state\n");
|
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state: %s (code %d)\n", strerror(r), r);
|
||||||
return 1;
|
return -r;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -548,7 +532,15 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
|
||||||
{
|
{
|
||||||
client_max_msg = rdma_max_msg;
|
client_max_msg = rdma_max_msg;
|
||||||
}
|
}
|
||||||
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
|
auto cl = clients.at(peer_fd);
|
||||||
|
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
|
||||||
|
if (!selected_ctx)
|
||||||
|
{
|
||||||
|
if (log_level > 0)
|
||||||
|
fprintf(stderr, "No RDMA context for peer %d, using only TCP\n", cl->peer_fd);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
msgr_rdma_connection_t *rdma_conn = msgr_rdma_connection_t::create(selected_ctx, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
|
||||||
if (rdma_conn)
|
if (rdma_conn)
|
||||||
{
|
{
|
||||||
int r = rdma_conn->connect(&addr);
|
int r = rdma_conn->connect(&addr);
|
||||||
|
@ -586,7 +578,7 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||||
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
if (err || bad_wr)
|
if (err || bad_wr)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
|
fprintf(stderr, "RDMA send failed: %s (code %d)\n", strerror(err), err);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
cl->rdma_conn->cur_send++;
|
cl->rdma_conn->cur_send++;
|
||||||
|
@ -667,9 +659,9 @@ void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
|
||||||
// Allocate send ring buffer, if not yet
|
// Allocate send ring buffer, if not yet
|
||||||
rc->send_out_size = rc->max_msg*rdma_max_send;
|
rc->send_out_size = rc->max_msg*rdma_max_send;
|
||||||
rc->send_out.buf = malloc_or_die(rc->send_out_size);
|
rc->send_out.buf = malloc_or_die(rc->send_out_size);
|
||||||
if (!rdma_context->odp)
|
if (!rc->ctx->odp)
|
||||||
{
|
{
|
||||||
rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
|
rc->send_out.mr = ibv_reg_mr(rc->ctx->pd, rc->send_out.buf, rc->send_out_size, 0);
|
||||||
if (!rc->send_out.mr)
|
if (!rc->send_out.mr)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
|
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
|
||||||
|
@ -699,7 +691,7 @@ void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
|
||||||
ibv_sge sge = {
|
ibv_sge sge = {
|
||||||
.addr = (uintptr_t)dst,
|
.addr = (uintptr_t)dst,
|
||||||
.length = (uint32_t)copied,
|
.length = (uint32_t)copied,
|
||||||
.lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
|
.lkey = rc->ctx->odp ? rc->ctx->mr->lkey : rc->send_out.mr->lkey,
|
||||||
};
|
};
|
||||||
try_send_rdma_wr(cl, &sge, 1);
|
try_send_rdma_wr(cl, &sge, 1);
|
||||||
rc->send_sizes.push_back(copied);
|
rc->send_sizes.push_back(copied);
|
||||||
|
@ -709,7 +701,7 @@ void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
|
||||||
|
|
||||||
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
if (rdma_context->odp)
|
if (cl->rdma_conn->ctx->odp)
|
||||||
try_send_rdma_odp(cl);
|
try_send_rdma_odp(cl);
|
||||||
else
|
else
|
||||||
try_send_rdma_nodp(cl);
|
try_send_rdma_nodp(cl);
|
||||||
|
@ -744,9 +736,9 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
msgr_rdma_buf_t b;
|
msgr_rdma_buf_t b;
|
||||||
b.buf = malloc_or_die(rc->max_msg);
|
b.buf = malloc_or_die(rc->max_msg);
|
||||||
if (!rdma_context->odp)
|
if (!rc->ctx->odp)
|
||||||
{
|
{
|
||||||
b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
|
b.mr = ibv_reg_mr(rc->ctx->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
|
||||||
if (!b.mr)
|
if (!b.mr)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
|
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
|
||||||
|
@ -761,7 +753,7 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||||
|
|
||||||
#define RDMA_EVENTS_AT_ONCE 32
|
#define RDMA_EVENTS_AT_ONCE 32
|
||||||
|
|
||||||
void osd_messenger_t::handle_rdma_events()
|
void osd_messenger_t::handle_rdma_events(msgr_rdma_context_t *rdma_context)
|
||||||
{
|
{
|
||||||
// Request next notification
|
// Request next notification
|
||||||
ibv_cq *ev_cq;
|
ibv_cq *ev_cq;
|
||||||
|
|
|
@ -2,9 +2,13 @@
|
||||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
#include <rdma/rdma_cma.h>
|
||||||
|
#endif
|
||||||
#include <infiniband/verbs.h>
|
#include <infiniband/verbs.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include "addr_util.h"
|
||||||
|
|
||||||
struct msgr_rdma_address_t
|
struct msgr_rdma_address_t
|
||||||
{
|
{
|
||||||
|
@ -20,7 +24,6 @@ struct msgr_rdma_address_t
|
||||||
struct msgr_rdma_context_t
|
struct msgr_rdma_context_t
|
||||||
{
|
{
|
||||||
ibv_context *context = NULL;
|
ibv_context *context = NULL;
|
||||||
ibv_device *dev = NULL;
|
|
||||||
ibv_device_attr_ex attrx;
|
ibv_device_attr_ex attrx;
|
||||||
ibv_pd *pd = NULL;
|
ibv_pd *pd = NULL;
|
||||||
bool odp = false;
|
bool odp = false;
|
||||||
|
@ -35,8 +38,17 @@ struct msgr_rdma_context_t
|
||||||
uint32_t mtu;
|
uint32_t mtu;
|
||||||
int max_cqe = 0;
|
int max_cqe = 0;
|
||||||
int used_max_cqe = 0;
|
int used_max_cqe = 0;
|
||||||
|
addr_mask_t net_mask = {};
|
||||||
|
bool is_cm = false;
|
||||||
|
int cm_refs = 0;
|
||||||
|
|
||||||
|
static std::vector<msgr_rdma_context_t*> create_all(const std::vector<addr_mask_t> & osd_network_masks,
|
||||||
|
const char *sel_dev_name, int sel_port_num, int sel_gid_index, uint32_t sel_mtu, bool odp, int log_level);
|
||||||
|
static msgr_rdma_context_t *create(ibv_device *dev, ibv_port_attr & portinfo,
|
||||||
|
int ib_port, int gid_index, uint32_t mtu, bool odp, int log_level);
|
||||||
|
static msgr_rdma_context_t* create_cm(ibv_context *ctx);
|
||||||
|
bool reserve_cqe(int n);
|
||||||
|
|
||||||
static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level);
|
|
||||||
~msgr_rdma_context_t();
|
~msgr_rdma_context_t();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -50,11 +62,14 @@ struct msgr_rdma_connection_t
|
||||||
{
|
{
|
||||||
msgr_rdma_context_t *ctx = NULL;
|
msgr_rdma_context_t *ctx = NULL;
|
||||||
ibv_qp *qp = NULL;
|
ibv_qp *qp = NULL;
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
|
rdma_cm_id *cmid = NULL;
|
||||||
|
#endif
|
||||||
msgr_rdma_address_t addr;
|
msgr_rdma_address_t addr;
|
||||||
int max_send = 0, max_recv = 0, max_sge = 0;
|
int max_send = 0, max_recv = 0, max_sge = 0;
|
||||||
int cur_send = 0, cur_recv = 0;
|
|
||||||
uint64_t max_msg = 0;
|
uint64_t max_msg = 0;
|
||||||
|
|
||||||
|
int cur_send = 0, cur_recv = 0;
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
int next_recv_buf = 0;
|
int next_recv_buf = 0;
|
||||||
std::vector<msgr_rdma_buf_t> recv_buffers;
|
std::vector<msgr_rdma_buf_t> recv_buffers;
|
||||||
|
|
|
@ -0,0 +1,526 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include "msgr_rdma.h"
|
||||||
|
#include "messenger.h"
|
||||||
|
|
||||||
|
struct rdmacm_connecting_t
|
||||||
|
{
|
||||||
|
rdma_cm_id *cmid = NULL;
|
||||||
|
int peer_fd = -1;
|
||||||
|
osd_num_t peer_osd = 0;
|
||||||
|
std::string addr;
|
||||||
|
sockaddr_storage parsed_addr = {};
|
||||||
|
int rdmacm_port = 0;
|
||||||
|
int tcp_port = 0;
|
||||||
|
int timeout_ms = 0;
|
||||||
|
int timeout_id = -1;
|
||||||
|
msgr_rdma_context_t *rdma_context = NULL;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create an RDMA-CM ID on the messenger's event channel, bind it to
// <bind_address>:<rdmacm_port> and start listening on it.
//
// bind_address - textual address to bind to
// rdmacm_port  - port to bind to
// bound_port   - optional (may be NULL); receives the port actually bound
// log_level    - when > 0, the listening address and port are printed to stderr
//
// Returns the listening ID, or NULL on failure (the reason is printed to stderr).
rdma_cm_id *osd_messenger_t::rdmacm_listen(const std::string & bind_address, int rdmacm_port, int *bound_port, int log_level)
{
    sockaddr_storage addr = {};
    rdma_cm_id *listener = NULL;
    // Declared before the first goto so that jumping to "fail" doesn't cross an initialization
    int actual_port = 0;
    int r = rdma_create_id(rdmacm_evch, &listener, NULL, RDMA_PS_TCP);
    if (r != 0)
    {
        fprintf(stderr, "Failed to create RDMA-CM ID: %s (code %d)\n", strerror(errno), errno);
        goto fail;
    }
    if (!string_to_addr(bind_address, 0, rdmacm_port, &addr))
    {
        fprintf(stderr, "Server address: %s is not valid\n", bind_address.c_str());
        goto fail;
    }
    r = rdma_bind_addr(listener, (sockaddr*)&addr);
    if (r != 0)
    {
        fprintf(stderr, "Failed to bind RDMA-CM to %s:%d: %s (code %d)\n", bind_address.c_str(), rdmacm_port, strerror(errno), errno);
        goto fail;
    }
    r = rdma_listen(listener, 128);
    if (r != 0)
    {
        fprintf(stderr, "Failed to listen to RDMA-CM address %s:%d: %s (code %d)\n", bind_address.c_str(), rdmacm_port, strerror(errno), errno);
        goto fail;
    }
    // Query the port once: it's needed for logging even when the caller passes bound_port == NULL
    actual_port = ntohs(rdma_get_src_port(listener));
    if (bound_port)
    {
        *bound_port = actual_port;
    }
    if (log_level > 0)
    {
        fprintf(stderr, "Listening to RDMA-CM address %s port %d\n", bind_address.c_str(), actual_port);
    }
    return listener;
fail:
    // listener is still NULL if rdma_create_id() itself failed;
    // rdma_destroy_id() is not documented to accept NULL, so don't pass it
    if (listener)
    {
        rdma_destroy_id(listener);
    }
    return NULL;
}
|
||||||
|
|
||||||
|
// Destroy a listening RDMA-CM ID previously returned by rdmacm_listen().
// A NULL listener (which rdmacm_listen() returns on failure) is ignored.
void osd_messenger_t::rdmacm_destroy_listener(rdma_cm_id *listener)
{
    if (!listener)
    {
        return;
    }
    rdma_destroy_id(listener);
}
|
||||||
|
|
||||||
|
void osd_messenger_t::handle_rdmacm_events()
|
||||||
|
{
|
||||||
|
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(
|
||||||
|
std::vector<rdma_cm_event> events_copy;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
rdma_cm_event *ev = NULL;
|
||||||
|
int r = rdma_get_cm_event(rdmacm_evch, &ev);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
if (errno == EAGAIN || errno == EINTR)
|
||||||
|
break;
|
||||||
|
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
events_copy.push_back(*ev);
|
||||||
|
r = rdma_ack_cm_event(ev);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to ack (free) RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto & evl: events_copy)
|
||||||
|
{
|
||||||
|
auto ev = &evl;
|
||||||
|
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||||
|
{
|
||||||
|
rdmacm_accept(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
|
||||||
|
ev->event == RDMA_CM_EVENT_REJECTED ||
|
||||||
|
ev->event == RDMA_CM_EVENT_DISCONNECTED ||
|
||||||
|
ev->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
|
||||||
|
{
|
||||||
|
auto event_type_name = ev->event == RDMA_CM_EVENT_CONNECT_ERROR ? "RDMA_CM_EVENT_CONNECT_ERROR" : (
|
||||||
|
ev->event == RDMA_CM_EVENT_REJECTED ? "RDMA_CM_EVENT_REJECTED" : (
|
||||||
|
ev->event == RDMA_CM_EVENT_DISCONNECTED ? "RDMA_CM_EVENT_DISCONNECTED" : "RDMA_CM_EVENT_DEVICE_REMOVAL"));
|
||||||
|
auto cli_it = rdmacm_connections.find(ev->id);
|
||||||
|
if (cli_it != rdmacm_connections.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Received %s event for peer %d, closing connection\n",
|
||||||
|
event_type_name, cli_it->second->peer_fd);
|
||||||
|
stop_client(cli_it->second->peer_fd);
|
||||||
|
}
|
||||||
|
else if (rdmacm_connecting.find(ev->id) != rdmacm_connecting.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Received %s event for RDMA-CM OSD %ju connection\n",
|
||||||
|
event_type_name, rdmacm_connecting[ev->id]->peer_osd);
|
||||||
|
rdmacm_established(ev);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Received %s event for an unknown RDMA-CM connection 0x%jx - ignoring\n",
|
||||||
|
event_type_name, (uint64_t)ev->id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ADDR_RESOLVED || ev->event == RDMA_CM_EVENT_ADDR_ERROR)
|
||||||
|
{
|
||||||
|
rdmacm_address_resolved(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ROUTE_RESOLVED || ev->event == RDMA_CM_EVENT_ROUTE_ERROR)
|
||||||
|
{
|
||||||
|
rdmacm_route_resolved(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_CONNECT_RESPONSE)
|
||||||
|
{
|
||||||
|
// Just OK
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_UNREACHABLE || ev->event == RDMA_CM_EVENT_REJECTED)
|
||||||
|
{
|
||||||
|
// Handle error
|
||||||
|
rdmacm_established(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ESTABLISHED)
|
||||||
|
{
|
||||||
|
rdmacm_established(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ADDR_CHANGE || ev->event == RDMA_CM_EVENT_TIMEWAIT_EXIT)
|
||||||
|
{
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Other events are unexpected
|
||||||
|
fprintf(stderr, "Unexpected RDMA-CM event type: %d\n", ev->event);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wrap an ibv_context into a new msgr_rdma_context_t marked as RDMA-CM (is_cm = true).
// Allocates a protection domain, a completion channel and a 4096-entry completion
// queue, and queries the extended device attributes into attrx.
// Returns NULL (after printing the reason to stderr) if any of these steps fails.
msgr_rdma_context_t* msgr_rdma_context_t::create_cm(ibv_context *ctx)
{
    msgr_rdma_context_t *res = new msgr_rdma_context_t;
    // Common failure path: report, drop the half-built context, return NULL
    auto fail = [&](const char *msg) -> msgr_rdma_context_t*
    {
        fputs(msg, stderr);
        delete res;
        return NULL;
    };
    res->is_cm = true;
    res->context = ctx;
    res->pd = ibv_alloc_pd(ctx);
    if (!res->pd)
        return fail("Couldn't allocate RDMA protection domain\n");
    res->odp = false;
    res->channel = ibv_create_comp_channel(res->context);
    if (!res->channel)
        return fail("Couldn't create RDMA completion channel\n");
    res->max_cqe = 4096;
    res->cq = ibv_create_cq(res->context, res->max_cqe, NULL, res->channel, 0);
    if (!res->cq)
        return fail("Couldn't create RDMA completion queue\n");
    if (ibv_query_device_ex(res->context, NULL, &res->attrx))
        return fail("Couldn't query RDMA device for its features\n");
    return res;
}
|
||||||
|
|
||||||
|
// Return the msgr_rdma_context_t that wraps <verbs>, creating and registering a new
// one if none of the known contexts uses it.
// Returns NULL if a new context can't be created.
msgr_rdma_context_t* osd_messenger_t::rdmacm_get_context(ibv_context *verbs)
{
    // Find the context by device
    // We assume that RDMA_CM ev->id->verbs is always the same for the same device (but PD for example isn't)
    for (auto known: rdma_contexts)
    {
        if (known->context == verbs)
            return known;
    }
    // Not seen yet - wrap into a new msgr_rdma_context_t
    msgr_rdma_context_t *fresh = msgr_rdma_context_t::create_cm(verbs);
    if (!fresh)
        return NULL;
    // Switch the completion channel to non-blocking mode and watch it for events
    int old_flags = fcntl(fresh->channel->fd, F_GETFL, 0);
    fcntl(fresh->channel->fd, F_SETFL, old_flags | O_NONBLOCK);
    tfd->set_fd_handler(fresh->channel->fd, false, [this, fresh](int notify_fd, int epoll_events)
    {
        handle_rdma_events(fresh);
    });
    // Run the handler once right away, then remember the context
    handle_rdma_events(fresh);
    rdma_contexts.push_back(fresh);
    return fresh;
}
|
||||||
|
|
||||||
|
// Create a reliable-connection (RC) queue pair for <cmid> in the context of its device.
// Reserves rdma_max_send+rdma_max_recv completion queue entries in that context and
// gives them back if the queue pair can't be created.
// Returns the context the QP belongs to, or NULL on failure.
msgr_rdma_context_t* osd_messenger_t::rdmacm_create_qp(rdma_cm_id *cmid)
{
    msgr_rdma_context_t *ctx = rdmacm_get_context(cmid->verbs);
    if (!ctx)
        return NULL;
    // NOTE(review): reserve_cqe() is declared as returning bool, but the result is
    // discarded here - confirm whether a failed reservation should abort QP creation
    ctx->reserve_cqe(rdma_max_send+rdma_max_recv);
    // Limit the configured SGE count by what the device reports
    auto sge_limit = rdma_max_sge > ctx->attrx.orig_attr.max_sge
        ? ctx->attrx.orig_attr.max_sge : rdma_max_sge;
    ibv_qp_init_attr init_attr = {};
    // Send and receive completions share the same CQ
    init_attr.send_cq = ctx->cq;
    init_attr.recv_cq = ctx->cq;
    init_attr.cap.max_send_wr = (uint32_t)rdma_max_send;
    init_attr.cap.max_recv_wr = (uint32_t)rdma_max_recv;
    init_attr.cap.max_send_sge = (uint32_t)sge_limit;
    init_attr.cap.max_recv_sge = (uint32_t)sge_limit;
    init_attr.qp_type = IBV_QPT_RC;
    if (rdma_create_qp(cmid, ctx->pd, &init_attr) == 0)
        return ctx;
    fprintf(stderr, "Failed to create a queue pair via RDMA-CM: %s (code %d)\n", strerror(errno), errno);
    // Give the reserved CQ entries back
    ctx->reserve_cqe(-rdma_max_send-rdma_max_recv);
    return NULL;
}
|
||||||
|
|
||||||
|
// Handle an incoming RDMA-CM connection request: create a queue pair for it, accept
// the connection and register the peer as a new client in PEER_RDMA state.
// On any failure the request's ID and the resources allocated here are released.
//
// ev - the connection request event; ev->id is the ID of the new connection
void osd_messenger_t::rdmacm_accept(rdma_cm_event *ev)
{
    // Make a fake FD (FIXME: do not use FDs for identifying clients!)
    int fake_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fake_fd < 0)
    {
        fprintf(stderr, "Failed to allocate a fake socket for RDMA-CM client: %s (code %d)\n", strerror(errno), errno);
        rdma_destroy_id(ev->id);
        return;
    }
    auto rdma_context = rdmacm_create_qp(ev->id);
    if (!rdma_context)
    {
        // The fake socket isn't owned by any client yet - don't leak it
        close(fake_fd);
        rdma_destroy_id(ev->id);
        return;
    }
    // We don't need private_data, RDMA_READ or ATOMIC so use default 1
    rdma_conn_param conn_params = {
        .responder_resources = 1,
        .initiator_depth = 1,
        .retry_count = 7,
        .rnr_retry_count = 7,
    };
    if (rdma_accept(ev->id, &conn_params) != 0)
    {
        fprintf(stderr, "Failed to accept RDMA-CM connection: %s (code %d)\n", strerror(errno), errno);
        // Undo everything done so far: CQ entry reservation, QP, ID and the fake socket
        rdma_context->reserve_cqe(-rdma_max_send-rdma_max_recv);
        rdma_destroy_qp(ev->id);
        rdma_destroy_id(ev->id);
        close(fake_fd);
        return;
    }
    rdma_context->cm_refs++;
    // Wrap into a new msgr_rdma_connection_t
    msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
    conn->ctx = rdma_context;
    conn->max_send = rdma_max_send;
    conn->max_recv = rdma_max_recv;
    // Same device-imposed SGE limit as the one applied when the QP is created
    conn->max_sge = rdma_max_sge > rdma_context->attrx.orig_attr.max_sge
        ? rdma_context->attrx.orig_attr.max_sge : rdma_max_sge;
    conn->max_msg = rdma_max_msg;
    conn->cmid = ev->id;
    conn->qp = ev->id->qp;
    auto cl = new osd_client_t();
    cl->peer_fd = fake_fd;
    cl->peer_state = PEER_RDMA;
    cl->peer_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
    cl->in_buf = malloc_or_die(receive_buffer_size);
    cl->rdma_conn = conn;
    // From here on the fake FD is owned by the client
    clients[fake_fd] = cl;
    rdmacm_connections[ev->id] = cl;
    // Add initial receive request(s)
    try_recv_rdma(cl);
    fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, fake_fd,
        addr_to_string(cl->peer_addr).c_str());
}
|
||||||
|
|
||||||
|
// Abort the outgoing RDMA-CM connection attempt identified by <cmid> and release
// everything it holds: timeout timer, fake FD, reserved CQ entries, queue pair and ID.
// Afterwards either retries the peer over TCP (unless disable_tcp is set) or reports
// the failure through on_connect_peer() with a negative error code.
//
// cmid - ID of the attempt; must be present in rdmacm_connecting (at() throws otherwise)
// res  - error code of either sign; 0 is reported as -EINVAL
void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
{
    rdmacm_connecting_t *attempt = rdmacm_connecting.at(cmid);
    // Save what is still needed after the attempt is deleted
    const std::string peer_addr = attempt->addr;
    const int fallback_port = attempt->tcp_port;
    const auto peer_osd = attempt->peer_osd;
    if (attempt->timeout_id >= 0)
    {
        tfd->clear_timer(attempt->timeout_id);
    }
    if (attempt->peer_fd >= 0)
    {
        close(attempt->peer_fd);
    }
    if (attempt->rdma_context)
    {
        // A context is only attached once a QP was created, so give its CQ entries back
        attempt->rdma_context->reserve_cqe(-rdma_max_send-rdma_max_recv);
    }
    if (attempt->cmid)
    {
        if (attempt->cmid->qp)
        {
            rdma_destroy_qp(attempt->cmid);
        }
        rdma_destroy_id(attempt->cmid);
    }
    rdmacm_connecting.erase(cmid);
    delete attempt;
    if (disable_tcp)
    {
        // TCP is disabled, so report the error; normalize the code to a negative value
        int err = res;
        if (res == 0)
            err = -EINVAL;
        else if (res > 0)
            err = -res;
        on_connect_peer(peer_osd, err);
        return;
    }
    // Fall back to TCP instead of just reporting the error to on_connect_peer()
    try_connect_peer_tcp(peer_osd, peer_addr.c_str(), fallback_port);
}
|
||||||
|
|
||||||
|
void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port)
|
||||||
|
{
|
||||||
|
struct sockaddr_storage sa = {};
|
||||||
|
if (!string_to_addr(addr, false, rdmacm_port, &sa))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Address %s is invalid\n", addr.c_str());
|
||||||
|
on_connect_peer(peer_osd, -EINVAL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
rdma_cm_id *cmid = NULL;
|
||||||
|
if (rdma_create_id(rdmacm_evch, &cmid, NULL, RDMA_PS_TCP) != 0)
|
||||||
|
{
|
||||||
|
int res = -errno;
|
||||||
|
fprintf(stderr, "Failed to create RDMA-CM ID: %s (code %d), using TCP\n", strerror(errno), errno);
|
||||||
|
if (!disable_tcp)
|
||||||
|
try_connect_peer_tcp(peer_osd, addr.c_str(), fallback_tcp_port);
|
||||||
|
else
|
||||||
|
on_connect_peer(peer_osd, res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Make a fake FD (FIXME: do not use FDs for identifying clients!)
|
||||||
|
int fake_fd = socket(AF_INET, SOCK_STREAM, 0);
|
||||||
|
if (fake_fd < 0)
|
||||||
|
{
|
||||||
|
int res = -errno;
|
||||||
|
rdma_destroy_id(cmid);
|
||||||
|
// Can't create socket, pointless to try TCP
|
||||||
|
on_connect_peer(peer_osd, res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto conn = new rdmacm_connecting_t;
|
||||||
|
rdmacm_connecting[cmid] = conn;
|
||||||
|
conn->cmid = cmid;
|
||||||
|
conn->peer_fd = fake_fd;
|
||||||
|
conn->peer_osd = peer_osd;
|
||||||
|
conn->addr = addr;
|
||||||
|
conn->parsed_addr = sa;
|
||||||
|
conn->rdmacm_port = rdmacm_port;
|
||||||
|
conn->tcp_port = fallback_tcp_port;
|
||||||
|
conn->timeout_ms = peer_connect_timeout*1000;
|
||||||
|
conn->timeout_id = -1;
|
||||||
|
if (peer_connect_timeout > 0)
|
||||||
|
{
|
||||||
|
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid](int timer_id)
|
||||||
|
{
|
||||||
|
auto conn = rdmacm_connecting.at(cmid);
|
||||||
|
conn->timeout_id = -1;
|
||||||
|
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
|
||||||
|
rdmacm_on_connect_peer_error(cmid, -EPIPE);
|
||||||
|
return;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
|
||||||
|
{
|
||||||
|
auto res = -errno;
|
||||||
|
// ENODEV means that the client doesn't have an RDMA device for this address
|
||||||
|
if (res != -ENODEV || log_level > 0)
|
||||||
|
fprintf(stderr, "Failed to resolve address %s via RDMA-CM: %s (code %d)\n", addr.c_str(), strerror(errno), errno);
|
||||||
|
rdmacm_on_connect_peer_error(cmid, res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second step of an outgoing RDMA-CM connection: called for ADDR_RESOLVED and
// ADDR_ERROR events. On success creates the queue pair and starts route resolution;
// on any failure the attempt is aborted via rdmacm_on_connect_peer_error().
void osd_messenger_t::rdmacm_address_resolved(rdma_cm_event *ev)
{
    rdma_cm_id *cmid = ev->id;
    auto found = rdmacm_connecting.find(cmid);
    if (found == rdmacm_connecting.end())
    {
        // Silently ignore unknown IDs
        return;
    }
    rdmacm_connecting_t *attempt = found->second;
    const bool resolved = ev->event == RDMA_CM_EVENT_ADDR_RESOLVED && ev->status == 0;
    if (!resolved)
    {
        // Only non-positive statuses are decoded with strerror()
        const char *reason = ev->status > 0 ? "unknown error" : strerror(-ev->status);
        fprintf(stderr, "Failed to resolve address %s via RDMA-CM: %s (code %d)\n", attempt->addr.c_str(),
            reason, ev->status);
        rdmacm_on_connect_peer_error(cmid, ev->status);
        return;
    }
    msgr_rdma_context_t *qp_ctx = rdmacm_create_qp(cmid);
    if (!qp_ctx)
    {
        rdmacm_on_connect_peer_error(cmid, -EIO);
        return;
    }
    // Remember the context so that the error path can release its CQ entries
    attempt->rdma_context = qp_ctx;
    if (rdma_resolve_route(cmid, attempt->timeout_ms) == 0)
    {
        return;
    }
    int res = -errno;
    fprintf(stderr, "Failed to resolve route to %s via RDMA-CM: %s (code %d)\n", attempt->addr.c_str(), strerror(errno), errno);
    rdmacm_on_connect_peer_error(cmid, res);
}
|
||||||
|
|
||||||
|
// Third step of an outgoing RDMA-CM connection: called for ROUTE_RESOLVED and
// ROUTE_ERROR events. On success issues rdma_connect(); on any failure the attempt
// is aborted via rdmacm_on_connect_peer_error().
void osd_messenger_t::rdmacm_route_resolved(rdma_cm_event *ev)
{
    rdma_cm_id *cmid = ev->id;
    auto found = rdmacm_connecting.find(cmid);
    if (found == rdmacm_connecting.end())
    {
        // Silently ignore unknown IDs
        return;
    }
    rdmacm_connecting_t *attempt = found->second;
    const bool resolved = ev->event == RDMA_CM_EVENT_ROUTE_RESOLVED && ev->status == 0;
    if (!resolved)
    {
        // Only non-positive statuses are decoded with strerror()
        const char *reason = ev->status > 0 ? "unknown error" : strerror(-ev->status);
        fprintf(stderr, "Failed to resolve route to %s via RDMA-CM: %s (code %d)\n", attempt->addr.c_str(),
            reason, ev->status);
        rdmacm_on_connect_peer_error(cmid, ev->status);
        return;
    }
    // We don't need private_data, RDMA_READ or ATOMIC so use default 1
    rdma_conn_param conn_params = {};
    conn_params.responder_resources = 1;
    conn_params.initiator_depth = 1;
    conn_params.retry_count = 7;
    conn_params.rnr_retry_count = 7;
    if (rdma_connect(cmid, &conn_params) == 0)
    {
        return;
    }
    int res = -errno;
    fprintf(stderr, "Failed to connect to %s:%d via RDMA-CM: %s (code %d)\n", attempt->addr.c_str(), attempt->rdmacm_port, strerror(errno), errno);
    rdmacm_on_connect_peer_error(cmid, res);
}
|
||||||
|
|
||||||
|
// Final step of an outgoing RDMA-CM connection.
// For an ESTABLISHED event with zero status, turns the pending attempt into a
// msgr_rdma_connection_t plus an osd_client_t in PEER_RDMA state, posts the initial
// receives and starts the peer config check. Any other event or a non-zero status
// is treated as a connection failure.
void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
{
    rdma_cm_id *cmid = ev->id;
    auto found = rdmacm_connecting.find(cmid);
    if (found == rdmacm_connecting.end())
    {
        // Silently ignore unknown IDs
        return;
    }
    rdmacm_connecting_t *attempt = found->second;
    const auto peer_osd = attempt->peer_osd;
    const bool connected = ev->event == RDMA_CM_EVENT_ESTABLISHED && ev->status == 0;
    if (!connected)
    {
        // Only non-positive statuses are decoded with strerror()
        const char *reason = ev->status > 0 ? "unknown error" : strerror(-ev->status);
        fprintf(stderr, "Failed to connect to %s:%d via RDMA-CM: %s (code %d)\n", attempt->addr.c_str(), attempt->rdmacm_port,
            reason, ev->status);
        rdmacm_on_connect_peer_error(cmid, ev->status);
        return;
    }
    // Wrap into a new msgr_rdma_connection_t
    msgr_rdma_connection_t *rdma_conn = new msgr_rdma_connection_t;
    msgr_rdma_context_t *ctx = attempt->rdma_context;
    rdma_conn->ctx = ctx;
    ctx->cm_refs++;
    rdma_conn->max_send = rdma_max_send;
    rdma_conn->max_recv = rdma_max_recv;
    // Limit the configured SGE count by what the device reports
    rdma_conn->max_sge = rdma_max_sge > ctx->attrx.orig_attr.max_sge
        ? ctx->attrx.orig_attr.max_sge : rdma_max_sge;
    rdma_conn->max_msg = rdma_max_msg;
    rdma_conn->cmid = attempt->cmid;
    rdma_conn->qp = attempt->cmid->qp;
    // And an osd_client_t
    osd_client_t *cl = new osd_client_t();
    cl->peer_addr = attempt->parsed_addr;
    cl->peer_port = attempt->rdmacm_port;
    cl->peer_fd = attempt->peer_fd;
    cl->peer_state = PEER_RDMA;
    cl->connect_timeout_id = -1;
    cl->osd_num = peer_osd;
    cl->in_buf = malloc_or_die(receive_buffer_size);
    cl->rdma_conn = rdma_conn;
    clients[attempt->peer_fd] = cl;
    // The attempt is over: stop its timeout and move the ID to the established connections
    if (attempt->timeout_id >= 0)
    {
        tfd->clear_timer(attempt->timeout_id);
    }
    delete attempt;
    rdmacm_connecting.erase(cmid);
    rdmacm_connections[cmid] = cl;
    if (log_level > 0)
    {
        fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
    }
    // Add initial receive request(s)
    try_recv_rdma(cl);
    check_peer_config(cl);
}
|
|
@ -187,6 +187,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
assert(cl->peer_state != PEER_RDMA);
|
||||||
if (ringloop && !use_sync_send_recv)
|
if (ringloop && !use_sync_send_recv)
|
||||||
{
|
{
|
||||||
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
||||||
|
|
|
@ -1194,7 +1194,7 @@ protected:
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (cur_op->opcode == OSD_OP_WRITE && watch->cfg.readonly)
|
if (cur_op->opcode == OSD_OP_WRITE && !inode && watch->cfg.readonly)
|
||||||
{
|
{
|
||||||
cur_op->retval = -EROFS;
|
cur_op->retval = -EROFS;
|
||||||
std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);
|
std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);
|
||||||
|
|
|
@ -294,7 +294,9 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
|
||||||
|
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
|
vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
vitastor_schedule_uring_handler(client);
|
vitastor_schedule_uring_handler(client);
|
||||||
|
#endif
|
||||||
qemu_mutex_unlock(&client->mutex);
|
qemu_mutex_unlock(&client->mutex);
|
||||||
|
|
||||||
while (!task->complete)
|
while (!task->complete)
|
||||||
|
@ -566,6 +568,22 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||||
static void vitastor_close(BlockDriverState *bs)
|
static void vitastor_close(BlockDriverState *bs)
|
||||||
{
|
{
|
||||||
VitastorClient *client = bs->opaque;
|
VitastorClient *client = bs->opaque;
|
||||||
|
if (client->uring_eventfd >= 0)
|
||||||
|
{
|
||||||
|
// clear the eventfd handler
|
||||||
|
universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, NULL, NULL, NULL);
|
||||||
|
int wait_bh = 0;
|
||||||
|
qemu_mutex_lock(&client->mutex);
|
||||||
|
// clear uring_eventfd itself to prevent future scheduling of new B/H
|
||||||
|
client->uring_eventfd = -1;
|
||||||
|
wait_bh = client->bh_uring_scheduled;
|
||||||
|
qemu_mutex_unlock(&client->mutex);
|
||||||
|
if (wait_bh)
|
||||||
|
{
|
||||||
|
// wait until existing scheduled B/H is ran
|
||||||
|
BDRV_POLL_WHILE(bs, client->bh_uring_scheduled);
|
||||||
|
}
|
||||||
|
}
|
||||||
vitastor_c_destroy(client->proxy);
|
vitastor_c_destroy(client->proxy);
|
||||||
if (client->fds)
|
if (client->fds)
|
||||||
{
|
{
|
||||||
|
@ -749,7 +767,9 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
|
||||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
|
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
vitastor_schedule_uring_handler(client);
|
vitastor_schedule_uring_handler(client);
|
||||||
|
#endif
|
||||||
qemu_mutex_unlock(&client->mutex);
|
qemu_mutex_unlock(&client->mutex);
|
||||||
|
|
||||||
while (!task.complete)
|
while (!task.complete)
|
||||||
|
@ -783,7 +803,9 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
|
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
vitastor_schedule_uring_handler(client);
|
vitastor_schedule_uring_handler(client);
|
||||||
|
#endif
|
||||||
qemu_mutex_unlock(&client->mutex);
|
qemu_mutex_unlock(&client->mutex);
|
||||||
|
|
||||||
while (!task.complete)
|
while (!task.complete)
|
||||||
|
@ -863,7 +885,9 @@ static int coroutine_fn vitastor_co_block_status(
|
||||||
task.bitmap = client->last_bitmap = NULL;
|
task.bitmap = client->last_bitmap = NULL;
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
vitastor_schedule_uring_handler(client);
|
vitastor_schedule_uring_handler(client);
|
||||||
|
#endif
|
||||||
qemu_mutex_unlock(&client->mutex);
|
qemu_mutex_unlock(&client->mutex);
|
||||||
while (!task.complete)
|
while (!task.complete)
|
||||||
{
|
{
|
||||||
|
@ -950,7 +974,9 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
|
||||||
|
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
|
vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
|
||||||
|
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
|
||||||
vitastor_schedule_uring_handler(client);
|
vitastor_schedule_uring_handler(client);
|
||||||
|
#endif
|
||||||
qemu_mutex_unlock(&client->mutex);
|
qemu_mutex_unlock(&client->mutex);
|
||||||
|
|
||||||
while (!task.complete)
|
while (!task.complete)
|
||||||
|
|
|
@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||||
|
|
||||||
Name: Vitastor
|
Name: Vitastor
|
||||||
Description: Vitastor client library
|
Description: Vitastor client library
|
||||||
Version: 1.11.0
|
Version: 2.1.0
|
||||||
Libs: -L${libdir} -lvitastor_client
|
Libs: -L${libdir} -lvitastor_client
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
||||||
|
|
|
@ -127,6 +127,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
|
||||||
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
|
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
|
||||||
self->ringloop = ringloop;
|
self->ringloop = ringloop;
|
||||||
self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
|
self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
|
||||||
|
ringloop->loop();
|
||||||
return self;
|
return self;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -150,6 +151,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
|
||||||
self->ringloop = ringloop;
|
self->ringloop = ringloop;
|
||||||
self->epmgr = new epoll_manager_t(self->ringloop);
|
self->epmgr = new epoll_manager_t(self->ringloop);
|
||||||
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
|
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
|
||||||
|
ringloop->loop();
|
||||||
return self;
|
return self;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -183,6 +185,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
|
||||||
self->ringloop = ringloop;
|
self->ringloop = ringloop;
|
||||||
self->epmgr = new epoll_manager_t(self->ringloop);
|
self->epmgr = new epoll_manager_t(self->ringloop);
|
||||||
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
|
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
|
||||||
|
ringloop->loop();
|
||||||
return self;
|
return self;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -228,6 +231,10 @@ void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||||
{
|
{
|
||||||
cb(opaque, 0);
|
cb(opaque, 0);
|
||||||
});
|
});
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_uring_wait_ready(vitastor_c *client)
|
void vitastor_c_uring_wait_ready(vitastor_c *client)
|
||||||
|
@ -284,6 +291,10 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||||
|
@ -305,6 +316,10 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||||
|
@ -322,6 +337,10 @@ void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||||
|
@ -344,6 +363,10 @@ void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset,
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||||
|
@ -356,6 +379,10 @@ void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque)
|
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque)
|
||||||
|
@ -365,6 +392,10 @@ void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler c
|
||||||
auto watch = client->cli->st_cli.watch_inode(std::string(image));
|
auto watch = client->cli->st_cli.watch_inode(std::string(image));
|
||||||
cb(opaque, (long)watch);
|
cb(opaque, (long)watch);
|
||||||
});
|
});
|
||||||
|
if (client->ringloop)
|
||||||
|
{
|
||||||
|
client->ringloop->loop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vitastor_c_close_watch(vitastor_c *client, void *handle)
|
void vitastor_c_close_watch(vitastor_c *client, void *handle)
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#define VITASTOR_QEMU_PROXY_H
|
#define VITASTOR_QEMU_PROXY_H
|
||||||
|
|
||||||
// C API wrapper version
|
// C API wrapper version
|
||||||
#define VITASTOR_C_API_VERSION 4
|
#define VITASTOR_C_API_VERSION 5
|
||||||
|
|
||||||
#ifndef POOL_ID_BITS
|
#ifndef POOL_ID_BITS
|
||||||
#define POOL_ID_BITS 16
|
#define POOL_ID_BITS 16
|
||||||
|
|
|
@ -160,11 +160,12 @@ static const char* help_text =
|
||||||
"vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>\n"
|
"vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>\n"
|
||||||
" Set OSD reweight, tags or noout flag.\n"
|
" Set OSD reweight, tags or noout flag.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
|
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs|pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
|
||||||
" List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:\n"
|
" List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:\n"
|
||||||
" --pool <pool name or number> Only list PGs of the given pool.\n"
|
" --pool <pool name or number> Only list PGs of the given pool.\n"
|
||||||
" --min <min pg number> Only list PGs with number >= min.\n"
|
" --min <min pg number> Only list PGs with number >= min.\n"
|
||||||
" --max <max pg number> Only list PGs with number <= max.\n"
|
" --max <max pg number> Only list PGs with number <= max.\n"
|
||||||
|
" --osd 1,2,... Only list PGs with some data on specified OSD(s).\n"
|
||||||
" Examples:\n"
|
" Examples:\n"
|
||||||
" vitastor-cli pg-list active+degraded\n"
|
" vitastor-cli pg-list active+degraded\n"
|
||||||
" vitastor-cli pg-list ^active\n"
|
" vitastor-cli pg-list ^active\n"
|
||||||
|
@ -186,7 +187,8 @@ static const char* help_text =
|
||||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||||
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
" --used_for_app fs:<name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||||
|
" --used_for_app s3:<name> Mark pool as used for S3 location with name <name>\n"
|
||||||
" --pg_stripe_size <number> Increase object grouping stripe\n"
|
" --pg_stripe_size <number> Increase object grouping stripe\n"
|
||||||
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
|
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
|
||||||
" --wait Wait for the new pool to come online\n"
|
" --wait Wait for the new pool to come online\n"
|
||||||
|
@ -198,7 +200,7 @@ static const char* help_text =
|
||||||
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
|
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
|
||||||
" Modify an existing pool. Modifiable parameters:\n"
|
" Modify an existing pool. Modifiable parameters:\n"
|
||||||
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
||||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
|
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_app <type>:<name>]\n"
|
||||||
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
||||||
" [--level_placement <rules>] [--raw_placement <rules>]\n"
|
" [--level_placement <rules>] [--raw_placement <rules>]\n"
|
||||||
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
||||||
|
@ -482,7 +484,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||||
cfg["osd_num"] = cmd[1];
|
cfg["osd_num"] = cmd[1];
|
||||||
action_cb = p->start_modify_osd(cfg);
|
action_cb = p->start_modify_osd(cfg);
|
||||||
}
|
}
|
||||||
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs")
|
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs" || cmd[0] == "pgs")
|
||||||
{
|
{
|
||||||
// Modify OSD configuration
|
// Modify OSD configuration
|
||||||
if (cmd.size() > 1)
|
if (cmd.size() > 1)
|
||||||
|
|
|
@ -98,5 +98,3 @@ std::string format_lat(uint64_t lat);
|
||||||
std::string format_q(double depth);
|
std::string format_q(double depth);
|
||||||
|
|
||||||
bool stupid_glob(const std::string str, const std::string glob);
|
bool stupid_glob(const std::string str, const std::string glob);
|
||||||
|
|
||||||
std::string implode(const std::string & sep, json11::Json array);
|
|
||||||
|
|
|
@ -92,12 +92,12 @@ struct image_creator_t
|
||||||
{
|
{
|
||||||
new_pool_id = pools.begin()->first;
|
new_pool_id = pools.begin()->first;
|
||||||
}
|
}
|
||||||
if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
|
if (new_pool_id && !pools.at(new_pool_id).used_for_app.empty() && !force)
|
||||||
{
|
{
|
||||||
result = (cli_result_t){
|
result = (cli_result_t){
|
||||||
.err = EINVAL,
|
.err = EINVAL,
|
||||||
.text = "Pool "+pools.at(new_pool_id).name+
|
.text = "Pool "+pools.at(new_pool_id).name+
|
||||||
" is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
|
" is used for application "+pools.at(new_pool_id).used_for_app+
|
||||||
". Use --force if you really know what you are doing",
|
". Use --force if you really know what you are doing",
|
||||||
};
|
};
|
||||||
state = 100;
|
state = 100;
|
||||||
|
|
|
@ -58,6 +58,12 @@ struct osd_changer_t
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (set_reweight && new_reweight > 1)
|
||||||
|
{
|
||||||
|
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be larger than 1" };
|
||||||
|
state = 100;
|
||||||
|
return;
|
||||||
|
}
|
||||||
parent->etcd_txn(json11::Json::object {
|
parent->etcd_txn(json11::Json::object {
|
||||||
{ "success", json11::Json::array {
|
{ "success", json11::Json::array {
|
||||||
json11::Json::object {
|
json11::Json::object {
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#include "epoll_manager.h"
|
#include "epoll_manager.h"
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
|
|
||||||
struct placement_osd_t
|
struct placement_osd_t
|
||||||
{
|
{
|
||||||
|
|
|
@ -5,12 +5,14 @@
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
|
|
||||||
struct pg_lister_t
|
struct pg_lister_t
|
||||||
{
|
{
|
||||||
cli_tool_t *parent;
|
cli_tool_t *parent;
|
||||||
|
|
||||||
uint64_t pool_id = 0;
|
uint64_t pool_id = 0;
|
||||||
|
std::set<osd_num_t> osd_nums;
|
||||||
std::string pool_name;
|
std::string pool_name;
|
||||||
std::vector<std::string> pg_state;
|
std::vector<std::string> pg_state;
|
||||||
uint64_t min_pg_num = 0;
|
uint64_t min_pg_num = 0;
|
||||||
|
@ -103,7 +105,7 @@ resume_1:
|
||||||
{
|
{
|
||||||
if (pg_state_names[i] == bit)
|
if (pg_state_names[i] == bit)
|
||||||
{
|
{
|
||||||
mask |= (uint64_t)1 << i;
|
mask |= pg_state_bits[i];
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -137,6 +139,22 @@ resume_1:
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (osd_nums.size())
|
||||||
|
{
|
||||||
|
bool found = false;
|
||||||
|
for (int i = 0; !found && i < pgp.second.target_set.size(); i++)
|
||||||
|
if (osd_nums.find(pgp.second.target_set[i]) != osd_nums.end())
|
||||||
|
found = true;
|
||||||
|
for (int i = 0; !found && i < pgp.second.target_history.size(); i++)
|
||||||
|
for (int j = 0; !found && j < pgp.second.target_history[i].size(); j++)
|
||||||
|
if (osd_nums.find(pgp.second.target_history[i][j]) != osd_nums.end())
|
||||||
|
found = true;
|
||||||
|
for (int i = 0; !found && i < pgp.second.all_peers.size(); i++)
|
||||||
|
if (osd_nums.find(pgp.second.all_peers[i]) != osd_nums.end())
|
||||||
|
found = true;
|
||||||
|
if (!found)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (masks.size())
|
if (masks.size())
|
||||||
{
|
{
|
||||||
bool found = false;
|
bool found = false;
|
||||||
|
@ -156,7 +174,7 @@ resume_1:
|
||||||
json11::Json::array state_names;
|
json11::Json::array state_names;
|
||||||
for (int i = 0; i < pg_state_bit_count; i++)
|
for (int i = 0; i < pg_state_bit_count; i++)
|
||||||
{
|
{
|
||||||
if (pgp.second.cur_state & (1 << i))
|
if (pgp.second.cur_state & pg_state_bits[i])
|
||||||
{
|
{
|
||||||
state_names.push_back(std::string(pg_state_names[i]));
|
state_names.push_back(std::string(pg_state_names[i]));
|
||||||
}
|
}
|
||||||
|
@ -274,6 +292,14 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_pg_list(json11::Json cfg)
|
||||||
pg_lister->pg_state.push_back(cfg["pg_state"].string_value());
|
pg_lister->pg_state.push_back(cfg["pg_state"].string_value());
|
||||||
pg_lister->min_pg_num = cfg["min"].uint64_value();
|
pg_lister->min_pg_num = cfg["min"].uint64_value();
|
||||||
pg_lister->max_pg_num = cfg["max"].uint64_value();
|
pg_lister->max_pg_num = cfg["max"].uint64_value();
|
||||||
|
if (cfg["osd"].is_array())
|
||||||
|
for (auto & osd_num_json: cfg["osd"].array_items())
|
||||||
|
pg_lister->osd_nums.insert(osd_num_json.uint64_value());
|
||||||
|
else if (cfg["osd"].is_string())
|
||||||
|
for (auto & osd_num_str: explode(",", cfg["osd"].string_value(), true))
|
||||||
|
pg_lister->osd_nums.insert(stoull_full(osd_num_str));
|
||||||
|
else if (cfg["osd"].uint64_value())
|
||||||
|
pg_lister->osd_nums.insert(cfg["osd"].uint64_value());
|
||||||
return [pg_lister](cli_result_t & result)
|
return [pg_lister](cli_result_t & result)
|
||||||
{
|
{
|
||||||
pg_lister->loop();
|
pg_lister->loop();
|
||||||
|
|
|
@ -90,8 +90,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
value = sz;
|
value = sz;
|
||||||
}
|
}
|
||||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
|
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_app" ||
|
||||||
key == "raw_placement")
|
key == "used_for_fs" || key == "raw_placement")
|
||||||
{
|
{
|
||||||
if (!value.is_string())
|
if (!value.is_string())
|
||||||
{
|
{
|
||||||
|
@ -156,8 +156,13 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
{
|
{
|
||||||
new_cfg.erase("parity_chunks");
|
new_cfg.erase("parity_chunks");
|
||||||
}
|
}
|
||||||
if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
|
if (new_cfg.find("used_for_app") != new_cfg.end() && new_cfg["used_for_app"].string_value() == "")
|
||||||
{
|
{
|
||||||
|
new_cfg.erase("used_for_app");
|
||||||
|
}
|
||||||
|
if (new_cfg.find("used_for_app") == new_cfg.end() && new_cfg.find("used_for_fs") != new_cfg.end())
|
||||||
|
{
|
||||||
|
new_cfg["used_for_app"] = "fs:"+new_cfg["used_for_fs"].string_value();
|
||||||
new_cfg.erase("used_for_fs");
|
new_cfg.erase("used_for_fs");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include "epoll_manager.h"
|
#include "epoll_manager.h"
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
|
|
||||||
struct pool_creator_t
|
struct pool_creator_t
|
||||||
{
|
{
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include "cli.h"
|
#include "cli.h"
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
|
|
||||||
// List pools with space statistics
|
// List pools with space statistics
|
||||||
|
@ -199,7 +200,9 @@ resume_1:
|
||||||
auto & st = pool_stats[pool_id];
|
auto & st = pool_stats[pool_id];
|
||||||
for (auto & kv: pp.second.object_items())
|
for (auto & kv: pp.second.object_items())
|
||||||
{
|
{
|
||||||
if (st.find(kv.first) == st.end())
|
if (kv.first == "used_for_fs" && st.find("used_for_app") == st.end())
|
||||||
|
st["used_for_app"] = "fs:"+kv.second.string_value();
|
||||||
|
else if (st.find(kv.first) == st.end())
|
||||||
st[kv.first] = kv.second;
|
st[kv.first] = kv.second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -493,7 +496,7 @@ resume_3:
|
||||||
{ "name", "Name" },
|
{ "name", "Name" },
|
||||||
{ "id", "ID" },
|
{ "id", "ID" },
|
||||||
{ "scheme_name", "Scheme" },
|
{ "scheme_name", "Scheme" },
|
||||||
{ "used_for_fs", "Used for VitastorFS" },
|
{ "used_for_app", "Used for app" },
|
||||||
{ "status", "Status" },
|
{ "status", "Status" },
|
||||||
{ "backfillfull_fmt", "Backfillfull" },
|
{ "backfillfull_fmt", "Backfillfull" },
|
||||||
{ "pg_count_fmt", "PGs" },
|
{ "pg_count_fmt", "PGs" },
|
||||||
|
@ -663,19 +666,3 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_pool_ls(json11::Json cfg)
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string implode(const std::string & sep, json11::Json array)
|
|
||||||
{
|
|
||||||
if (array.is_number() || array.is_bool() || array.is_string())
|
|
||||||
{
|
|
||||||
return array.as_string();
|
|
||||||
}
|
|
||||||
std::string res;
|
|
||||||
bool first = true;
|
|
||||||
for (auto & item: array.array_items())
|
|
||||||
{
|
|
||||||
res += (first ? item.as_string() : sep+item.as_string());
|
|
||||||
first = false;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
|
@ -112,19 +112,21 @@ resume_1:
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_cfg.find("used_for_fs") != new_cfg.end() && !force)
|
if (new_cfg.find("used_for_app") != new_cfg.end() && !force)
|
||||||
{
|
{
|
||||||
// Check that pool doesn't have images
|
// Check that pool doesn't have images
|
||||||
auto img_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(pool_id, 0));
|
auto img_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(pool_id, 0));
|
||||||
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id &&
|
if (img_it != parent->cli->st_cli.inode_config.end() &&
|
||||||
img_it->second.name == new_cfg["used_for_fs"].string_value())
|
INODE_POOL(img_it->first) == pool_id &&
|
||||||
|
new_cfg["used_for_app"].string_value().substr(0, 3) == "fs:" &&
|
||||||
|
img_it->second.name == new_cfg["used_for_app"].string_value().substr(3))
|
||||||
{
|
{
|
||||||
// Only allow metadata image to exist in the FS pool
|
// Only allow metadata image to exist in the FS pool
|
||||||
img_it++;
|
img_it++;
|
||||||
}
|
}
|
||||||
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id)
|
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id)
|
||||||
{
|
{
|
||||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS" };
|
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS, S3 or another app" };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,11 +69,11 @@ struct rm_inode_t
|
||||||
});
|
});
|
||||||
if (min_offset == 0 && max_offset == 0)
|
if (min_offset == 0 && max_offset == 0)
|
||||||
{
|
{
|
||||||
total_count += objects.size();
|
total_count += rm->objects.size();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (object_id oid: objects)
|
for (object_id oid: rm->objects)
|
||||||
{
|
{
|
||||||
if (oid.stripe >= min_offset && (!max_offset || oid.stripe < max_offset))
|
if (oid.stripe >= min_offset && (!max_offset || oid.stripe < max_offset))
|
||||||
{
|
{
|
||||||
|
|
|
@ -138,7 +138,7 @@ struct wildcard_remover_t
|
||||||
if (inode_it != parent->cli->st_cli.inode_config.end())
|
if (inode_it != parent->cli->st_cli.inode_config.end())
|
||||||
fprintf(stderr, "Warning: image %s modified by someone else during deletion, restarting wildcard deletion\n", inode_it->second.name.c_str());
|
fprintf(stderr, "Warning: image %s modified by someone else during deletion, restarting wildcard deletion\n", inode_it->second.name.c_str());
|
||||||
else
|
else
|
||||||
fprintf(stderr, "Warning: inode %lx modified by someone else during deletion, retrying wildcard deletion\n", irev.inode_num);
|
fprintf(stderr, "Warning: inode %jx modified by someone else during deletion, retrying wildcard deletion\n", irev.inode_num);
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ project(vitastor)
|
||||||
# vitastor-disk
|
# vitastor-disk
|
||||||
add_executable(vitastor-disk
|
add_executable(vitastor-disk
|
||||||
disk_tool.cpp disk_simple_offsets.cpp
|
disk_tool.cpp disk_simple_offsets.cpp
|
||||||
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
|
disk_tool_discard.cpp disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
|
||||||
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
|
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
|
||||||
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
|
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
|
||||||
)
|
)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue