Compare commits
79 commits: v1.10.0 ... qemu-fix-c

afcab324e1, 819f1125ae, 108df7329f, d32edf6cdf, dca436d7e6, 8129a0b4e3, 704c87d512,
10216a5fb5, 3932eb7ff6, 69cbe7bbb2, 4950a1636c, 2eb20dff28, 59f0b0427c, 124162ad38,
391c92af1a, c3d8fdd855, 9ccf3af97b, 568a209f0d, b151013201, 4a763725fe, b8d83cd7f4,
2e9ee2fe20, 508ae852e4, 97ee400505, 5ee4894fab, 125dcafb11, 9f44cf71df, df3c63ca7f,
be66edd09f, ccbc0c5928, 78ca4538bf, 86b5760ec1, 27f3803d2f, 2ead06e126, a5d5559f8e,
e8e7ba8fde, 6fd831a299, 069808dfce, bcefa42bc0, 4636e02d43, e4c7d1c147, a4677f3e69,
7cbf207d65, 7c9711af20, 33ef701464, 61ededa230, d9d90d3183, 9dbcdbcec9, a147f7e7dc,
0e6bf66734, ab822d3050, d5366a0767, 40b8a8b0da, 5c5119aba4, 4edda88903, 80dda3ca94,
c8decb32e8, 4995592e61, d9f9b0bca5, d0396267d0, b46d5db115, ecd92655fe, 383712148b,
42d40153ff, 561b36a4c1, 685af019f5, a31592d131, 28b0a2597d, de6b345473, 8bf52d6e96,
5623dca02c, abdc207297, 044e621b62, ba9aabf187, 5c890e4a12, 0b0c2afbce, 651c055bd9,
42eebfc1bd, cef98052f5
@@ -414,6 +414,24 @@ jobs:
           echo ""
         done
 
+  test_rm_degraded:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_rm_degraded.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
   test_snapshot_chain:
     runs-on: ubuntu-latest
     needs: build
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
 
 project(vitastor)
 
-set(VITASTOR_VERSION "1.10.0")
+set(VITASTOR_VERSION "1.11.0")
 
 add_subdirectory(src)
@@ -41,6 +41,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
 - [Автор и лицензия](docs/intro/author.ru.md)
 - Установка
   - [Пакеты](docs/installation/packages.ru.md)
+  - [Docker](docs/installation/docker.ru.md)
   - [Proxmox](docs/installation/proxmox.ru.md)
   - [OpenNebula](docs/installation/opennebula.ru.md)
   - [OpenStack](docs/installation/openstack.ru.md)
@@ -41,6 +41,7 @@ Read more details in the documentation. You can start from here: [Quick Start](d
 - [Author and license](docs/intro/author.en.md)
 - Installation
   - [Packages](docs/installation/packages.en.md)
+  - [Docker](docs/installation/docker.en.md)
   - [Proxmox](docs/installation/proxmox.en.md)
   - [OpenNebula](docs/installation/opennebula.en.md)
   - [OpenStack](docs/installation/openstack.en.md)
@@ -1,4 +1,4 @@
-VITASTOR_VERSION ?= v1.10.0
+VITASTOR_VERSION ?= v1.11.0
 
 all: build push
 
@@ -49,7 +49,7 @@ spec:
           capabilities:
             add: ["SYS_ADMIN"]
           allowPrivilegeEscalation: true
-        image: vitalif/vitastor-csi:v1.10.0
+        image: vitalif/vitastor-csi:v1.11.0
         args:
         - "--node=$(NODE_ID)"
         - "--endpoint=$(CSI_ENDPOINT)"
@@ -121,7 +121,7 @@ spec:
         privileged: true
         capabilities:
           add: ["SYS_ADMIN"]
-      image: vitalif/vitastor-csi:v1.10.0
+      image: vitalif/vitastor-csi:v1.11.0
       args:
       - "--node=$(NODE_ID)"
      - "--endpoint=$(CSI_ENDPOINT)"
@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.10.0"
+    vitastorCSIDriverVersion = "1.11.0"
 )
 
 // Config struct fills the parameters of request or user input
debian/changelog (vendored, 2 lines changed)
@@ -1,4 +1,4 @@
-vitastor (1.10.0-1) unstable; urgency=medium
+vitastor (1.11.0-1) unstable; urgency=medium
 
   * Bugfixes
 
debian/control (vendored, 11 lines changed)
@@ -2,7 +2,10 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8),
+ linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
+ libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
+ node-bindings <!nocheck>, node-gyp, node-nan
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
@@ -59,3 +62,9 @@ Architecture: amd64
 Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client, patch, python3, jq
 Description: Vitastor OpenNebula storage plugin
  Vitastor storage plugin for OpenNebula.
+
+Package: node-vitastor
+Architecture: amd64
+Depends: ${shlibs:Depends}, ${misc:Depends}, node-bindings
+Description: Node.js bindings for Vitastor client
+ Node.js native bindings for the Vitastor client library (vitastor-client).
debian/node-vitastor.install (vendored, new file, 1 line)
@@ -0,0 +1 @@
+usr/lib/x86_64-linux-gnu/nodejs/vitastor
debian/patched-qemu.Dockerfile (vendored, 12 lines changed)
@@ -1,8 +1,10 @@
 # Build patched QEMU for Debian inside a container
 # cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
 
+ARG DISTRO=debian
 ARG REL=
-FROM debian:$REL
+FROM $DISTRO:$REL
+ARG DISTRO=debian
 ARG REL=
 
 WORKDIR /root
@@ -20,8 +22,8 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then
     echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
 
 RUN apt-get update
-RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
-RUN apt-get -y build-dep qemu
+RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
+RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y build-dep qemu
 # To build a custom version
 #RUN cp /root/packages/qemu-orig/* /root
 RUN apt-get --download-only source qemu
@@ -38,9 +40,9 @@ ADD src/client/qemu_driver.c /root/qemu_driver.c
 # apt-get install -y vitastor-client vitastor-client-dev quilt
 
 RUN set -e; \
-    dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
+    DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
     apt-get update; \
-    apt-get install -y quilt; \
+    DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install quilt; \
     mkdir -p /root/packages/qemu-$REL; \
     rm -rf /root/packages/qemu-$REL/*; \
     cd /root/packages/qemu-$REL; \
debian/rules (vendored, 8 lines changed)
@@ -4,6 +4,14 @@ export DH_VERBOSE = 1
 %:
 	dh $@
 
+override_dh_install:
+	perl -pe 's!prefix=/usr!prefix='`pwd`'/debian/tmp/usr!' < obj-x86_64-linux-gnu/src/client/vitastor.pc > node-binding/vitastor.pc
+	cd node-binding && PKG_CONFIG_PATH=./ PKG_CONFIG_ALLOW_SYSTEM_CFLAGS=1 npm install --unsafe-perm || exit 1
+	mkdir -p debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor/build/Release
+	cp -v node-binding/package.json node-binding/index.js node-binding/addon.cc node-binding/addon.h node-binding/client.cc node-binding/client.h debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor
+	cp -v node-binding/build/Release/addon.node debian/tmp/usr/lib/x86_64-linux-gnu/nodejs/vitastor/build/Release
+	dh_install
+
 override_dh_installdeb:
 	cat debian/fio_version >> debian/vitastor-fio.substvars
 	[ -f debian/qemu_version ] && (cat debian/qemu_version >> debian/vitastor-qemu.substvars) || true
debian/vitastor.Dockerfile (vendored, 3 lines changed)
@@ -22,7 +22,8 @@ RUN set -e -x; \
     echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
 
 RUN apt-get update && \
-    apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl && \
+    apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake \
+        libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
     apt-get -y build-dep fio && \
     apt-get --download-only source fio
@@ -1,9 +1,11 @@
 # Build Docker image with Vitastor packages
 
-FROM debian:bullseye
+FROM debian:bookworm
 
-ADD vitastor.list /etc/apt/sources.list.d
-ADD vitastor.gpg /etc/apt/trusted.gpg.d
-ADD vitastor.pref /etc/apt/preferences.d
-ADD apt.conf /etc/apt/
-RUN apt-get update && apt-get -y install vitastor qemu-system-x86 qemu-system-common && apt-get clean
+ADD etc/apt /etc/apt/
+RUN apt-get update && apt-get -y install vitastor qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
+ADD sleep.sh /usr/bin/
+ADD install.sh /usr/bin/
+ADD scripts /opt/scripts/
+ADD etc /etc/
+RUN ln -s /usr/lib/vitastor/mon/make-etcd /usr/bin/make-etcd
docker/Makefile (new file, 9 lines)
@@ -0,0 +1,9 @@
+VITASTOR_VERSION ?= v1.11.0
+
+all: build push
+
+build:
+	@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
+
+push:
+	@docker push vitalif/vitastor:$(VITASTOR_VERSION)
docker/etc/apt/sources.list.d/vitastor.list (new file, 1 line)
@@ -0,0 +1 @@
+deb http://vitastor.io/debian bookworm main
docker/etc/systemd/system/vitastor-etcd.service (new file, 27 lines)
@@ -0,0 +1,27 @@
+[Unit]
+Description=Containerized etcd for Vitastor
+After=network-online.target local-fs.target time-sync.target docker.service vitastor-host.service
+Wants=network-online.target local-fs.target time-sync.target docker.service vitastor-host.service
+PartOf=vitastor.target
+
+[Service]
+Restart=always
+Environment=GOGC=50
+EnvironmentFile=/etc/vitastor/docker.conf
+EnvironmentFile=/etc/vitastor/etcd.conf
+SyslogIdentifier=etcd
+ExecStart=bash -c 'docker run --rm -i -v /var/lib/vitastor/etcd:/data \
+    --log-driver none --network host $CONTAINER_OPTIONS --name vitastor-etcd \
+    $ETCD_IMAGE /usr/local/bin/etcd --name "$ETCD_NAME" --data-dir /data \
+    --snapshot-count 10000 --advertise-client-urls http://$ETCD_IP:2379 --listen-client-urls http://$ETCD_IP:2379 \
+    --initial-advertise-peer-urls http://$ETCD_IP:2380 --listen-peer-urls http://$ETCD_IP:2380 \
+    --initial-cluster-token vitastor-etcd-1 --initial-cluster "$ETCD_INITIAL_CLUSTER" \
+    --initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \
+    --auto-compaction-retention=10 --auto-compaction-mode=revision'
+ExecStop=docker stop vitastor-etcd
+Restart=always
+StartLimitInterval=0
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
docker/etc/systemd/system/vitastor-host.service (new file, 23 lines)
@@ -0,0 +1,23 @@
+[Unit]
+Description=Empty container for running Vitastor commands
+After=network-online.target local-fs.target time-sync.target docker.service
+Wants=network-online.target local-fs.target time-sync.target docker.service
+PartOf=vitastor.target
+
+[Service]
+Restart=always
+EnvironmentFile=/etc/vitastor/docker.conf
+ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
+    --privileged --log-driver none --network host --name vitastor vitastor:$VITASTOR_VERSION \
+    sleep.sh'
+ExecStartPost=udevadm trigger
+ExecStop=docker stop vitastor
+WorkingDirectory=/
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
docker/etc/systemd/system/vitastor-mon.service (new file, 23 lines)
@@ -0,0 +1,23 @@
+[Unit]
+Description=Containerized Vitastor monitor
+After=network-online.target local-fs.target time-sync.target docker.service
+Wants=network-online.target local-fs.target time-sync.target docker.service
+PartOf=vitastor.target
+
+[Service]
+Restart=always
+EnvironmentFile=/etc/vitastor/docker.conf
+SyslogIdentifier=vitastor-mon
+ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /var/lib/vitastor:/var/lib/vitastor -v /dev:/dev \
+    --log-driver none --network host $CONTAINER_OPTIONS --name vitastor-mon vitastor:$VITASTOR_VERSION \
+    node /usr/lib/vitastor/mon/mon-main.js'
+ExecStop=docker stop vitastor-mon
+WorkingDirectory=/
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
docker/etc/systemd/system/vitastor-osd@.service (new file, 27 lines)
@@ -0,0 +1,27 @@
+[Unit]
+Description=Containerized Vitastor object storage daemon osd.%i
+After=network-online.target local-fs.target time-sync.target docker.service vitastor-host.service
+Wants=network-online.target local-fs.target time-sync.target docker.service vitastor-host.service
+PartOf=vitastor.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+LimitMEMLOCK=infinity
+EnvironmentFile=/etc/vitastor/docker.conf
+SyslogIdentifier=vitastor-osd%i
+ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
+    $(for i in $(ls /dev/vitastor/osd%i-*); do echo --device $i:$i; done) \
+    --log-driver none --network host --ulimit nofile=1048576 --ulimit memlock=-1 $CONTAINER_OPTIONS --name vitastor-osd%i \
+    vitastor:$VITASTOR_VERSION vitastor-disk exec-osd /dev/vitastor/osd%i-data'
+ExecStartPre=+docker exec vitastor vitastor-disk pre-exec /dev/vitastor/osd%i-data
+ExecStop=docker stop vitastor-osd%i
+WorkingDirectory=/
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+RestartSec=10
+
+[Install]
+WantedBy=vitastor.target
docker/etc/systemd/system/vitastor.target (new file, 4 lines)
@@ -0,0 +1,4 @@
+[Unit]
+Description=vitastor target
+[Install]
+WantedBy=multi-user.target
docker/etc/udev/rules.d/90-vitastor.rules (new file, 7 lines)
@@ -0,0 +1,7 @@
+SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de13059903", \
+    OWNER="vitastor", GROUP="vitastor", \
+    IMPORT{program}="/usr/bin/docker exec vitastor vitastor-disk udev $devnode", \
+    SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
+
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
docker/etc/vitastor/docker.conf (new file, 11 lines)
@@ -0,0 +1,11 @@
+#
+# Configuration file for containerized Vitastor installation
+# (non-Kubernetes, with systemd and udev-based orchestration)
+#
+
+# Desired Vitastor version
+VITASTOR_VERSION=1.11.0
+
+# Additional arguments for all containers
+# For example, you may want to specify a custom logging driver here
+CONTAINER_OPTIONS=""
docker/etc/vitastor/etcd.conf (new file, 4 lines)
@@ -0,0 +1,4 @@
+ETCD_IMAGE=quay.io/coreos/etcd:v3.5.18
+ETCD_NAME=""
+ETCD_IP=""
+ETCD_INITIAL_CLUSTER=""
docker/etc/vitastor/vitastor.conf (new file, 2 lines)
@@ -0,0 +1,2 @@
+{
+}
docker/install.sh (new executable file, 9 lines)
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+cp -urv /etc/default /host-etc/
+cp -urv /etc/systemd /host-etc/
+cp -urv /etc/udev /host-etc/
+cp -urnv /etc/vitastor /host-etc/
+cp -urnv /opt/scripts/* /host-bin/
docker/scripts/vitastor-cli (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker exec -it vitastor vitastor-cli "$@"
docker/scripts/vitastor-disk (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker exec -it vitastor vitastor-disk "$@"
docker/scripts/vitastor-fio (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker exec -it vitastor fio "$@"
docker/scripts/vitastor-nbd (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker exec -it vitastor vitastor-nbd "$@"
docker/sleep.sh (new executable file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+while :; do sleep infinity; done
@@ -1 +0,0 @@
-deb http://vitastor.io/debian bullseye main
@@ -13,7 +13,7 @@ Vitastor configuration consists of:
 - [Separate OSD settings](config/pool.en.md#osd-settings)
 - [Inode configuration](config/inode.en.md) i.e. image metadata like name, size and parent reference
 
-Configuration parameters can be set in 3 places:
+Configuration parameters can be set in 4 places:
 - Configuration file (`/etc/vitastor/vitastor.conf` or other path)
 - etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
   connection parameters should obviously be set in the configuration file.
@@ -14,7 +14,7 @@
 - [Настроек инодов](config/inode.ru.md), т.е. метаданных образов, таких, как имя, размер и ссылки на
   родительский образ
 
-Параметры конфигурации могут задаваться в 3 местах:
+Параметры конфигурации могут задаваться в 4 местах:
 - Файле конфигурации (`/etc/vitastor/vitastor.conf` или по другому пути)
 - Ключе в etcd `/vitastor/config/global`. Большая часть параметров может
   задаваться там, кроме, естественно, самих параметров соединения с etcd,
@@ -13,6 +13,7 @@ affect their interaction with the cluster.
 - [client_retry_interval](#client_retry_interval)
 - [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_retry_enospc](#client_retry_enospc)
+- [client_wait_up_timeout](#client_wait_up_timeout)
 - [client_max_dirty_bytes](#client_max_dirty_bytes)
 - [client_max_dirty_ops](#client_max_dirty_ops)
 - [client_enable_writeback](#client_enable_writeback)
@@ -70,6 +71,19 @@ and clients are not blocked and just get EIO error code instead.
 Retry writes on out of space errors to wait until some space is freed on
 OSDs.
 
+## client_wait_up_timeout
+
+- Type: seconds
+- Default: 16
+- Can be changed online: yes
+
+Wait for this number of seconds until PGs are up when doing operations
+which require all PGs to be up. Currently only used by object listings
+in delete and merge-based commands ([vitastor-cli rm](../usage/cli.en.md#rm), merge and so on).
+
+The default value is calculated as `1 + OSD lease timeout`, which is
+`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
+
 ## client_max_dirty_bytes
 
 - Type: integer
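As a worked example, the formula above reproduces the documented default of 16 seconds when the three underlying parameters take what are, as an assumption here, their stock values (etcd_report_interval = 5 s, max_etcd_attempts = 5, etcd_quick_timeout = 1000 ms):

```js
// Sketch: how the client_wait_up_timeout default of 16 s falls out of
// 1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout.
// The three parameter values are assumed stock defaults, not taken from this diff.
const etcd_report_interval = 5;  // seconds
const max_etcd_attempts = 5;
const etcd_quick_timeout = 1000; // milliseconds
const lease_timeout = etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout/1000;
console.log(1 + lease_timeout);  // 16
```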
@@ -13,6 +13,7 @@
 - [client_retry_interval](#client_retry_interval)
 - [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_retry_enospc](#client_retry_enospc)
+- [client_wait_up_timeout](#client_wait_up_timeout)
 - [client_max_dirty_bytes](#client_max_dirty_bytes)
 - [client_max_dirty_ops](#client_max_dirty_ops)
 - [client_enable_writeback](#client_enable_writeback)
@@ -72,6 +73,19 @@ RDMA и хотите повысить пиковую производитель
 Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
 ожидать, пока на OSD не освободится место.
 
+## client_wait_up_timeout
+
+- Тип: секунды
+- Значение по умолчанию: 16
+- Можно менять на лету: да
+
+Время ожидания поднятия PG при операциях, требующих активности всех PG.
+В данный момент используется листингами объектов в командах, использующих
+удаление и слияние ([vitastor-cli rm](../usage/cli.ru.md#rm), merge и подобные).
+
+Значение по умолчанию вычисляется как `1 + время lease OSD`, равное
+`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
+
 ## client_max_dirty_bytes
 
 - Тип: целое число
@@ -316,7 +316,7 @@ for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
 decrease write performance for fast disks because page cache is an overhead
 itself.
 
-Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
+Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
 (which requires disable_data_fsync) with drives having write-back cache
 which can't be turned off, for example, Intel Optane. Also note that *some*
 desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
@@ -43,7 +43,7 @@ Parameters:
|
||||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [used_for_fs](#used_for_fs)
|
||||
- [used_for_app](#used_for_app)
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -377,24 +377,37 @@ of the OSDs containing a data chunk for a PG.
|
||||
Automatic scrubbing interval for this pool. Overrides
|
||||
[global scrub_interval setting](osd.en.md#scrub_interval).
|
||||
|
||||
## used_for_fs
|
||||
## used_for_app
|
||||
|
||||
- Type: string
|
||||
|
||||
If non-empty, the pool is marked as used for VitastorFS with metadata stored
|
||||
in block image (regular Vitastor volume) named as the value of this pool parameter.
|
||||
If non-empty, the pool is marked as used for a separate application, for example,
|
||||
VitastorFS or S3, which allocates Vitastor volume IDs by itself and does not use
|
||||
image/inode metadata in etcd.
|
||||
|
||||
When a pool is marked as used for VitastorFS, regular block volume creation in it
|
||||
When a pool is marked as used for such app, regular block volume creation in it
|
||||
is disabled (vitastor-cli refuses to create images without --force) to protect
|
||||
the user from block volume and FS file ID collisions and data loss.
|
||||
the user from block volume and FS/S3 volume ID collisions and data loss.
|
||||
|
||||
[vitastor-nfs](../usage/nfs.ru.md), in its turn, refuses to use pools not marked
|
||||
Also such pools do not calculate per-inode space usage statistics in etcd because
|
||||
using it for an external application implies that it may contain a very large
|
||||
number of volumes and their statistics may take too much space in etcd.
|
||||
|
||||
Setting used_for_app to `fs:<name>` tells Vitastor that the pool is used for VitastorFS
|
||||
with VitastorKV metadata base stored in a block image (regular Vitastor volume) named
|
||||
`<name>`.
|
||||
|
||||
[vitastor-nfs](../usage/nfs.en.md), in its turn, refuses to use pools not marked
|
||||
for the corresponding FS when starting. This also implies that you can use one
|
||||
pool only for one VitastorFS.
|
||||
|
||||
The second thing that is disabled for VitastorFS pools is reporting per-inode space
|
||||
usage statistics in etcd because a FS pool may store a very large number of files
|
||||
and statistics for them all would take a lot of space in etcd.
|
||||
If you plan to use the pool for S3, set its used_for_app to `s3:<name>`. `<name>` may
|
||||
be basically anything you want (for example, `s3:standard`) - it's not validated
|
||||
by Vitastor S3 components in any way.
|
||||
|
||||
All other values except prefixed with `fs:` or `s3:` may be used freely and don't
|
||||
mean anything special for Vitastor core components. For now, you can use them as
|
||||
you wish.
|
||||
|
||||
# Examples
|
||||
|
||||
|
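The `fs:`/`s3:` prefix convention documented above is easy to mirror in tooling. A minimal sketch (a hypothetical helper, not part of Vitastor itself) of splitting used_for_app into an application kind and a name:

```js
// Hypothetical helper mirroring the used_for_app convention documented above.
function parse_used_for_app(value)
{
    const m = /^(fs|s3):(.+)$/.exec(value || '');
    if (m)
        return { app: m[1], name: m[2] }; // fs:<name> or s3:<name>
    // any other non-empty value just marks the pool as used by some external app
    return value ? { app: 'other', name: value } : null;
}

console.log(parse_used_for_app('fs:testfs'));   // { app: 'fs', name: 'testfs' }
console.log(parse_used_for_app('s3:standard')); // { app: 's3', name: 'standard' }
```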
@@ -42,7 +42,7 @@
 - [osd_tags](#osd_tags)
 - [primary_affinity_tags](#primary_affinity_tags)
 - [scrub_interval](#scrub_interval)
-- [used_for_fs](#used_for_fs)
+- [used_for_app](#used_for_app)
 
 Примеры:
 
@@ -256,7 +256,7 @@ PG в Vitastor эферемерны, то есть вы можете менят
 
 ## raw_placement
 
-- Type: string
+- Тип: строка
 
 Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
 Используйте, только если действительно знаете, зачем вам это надо :)
@@ -383,26 +383,39 @@ OSD с "all".
 Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
 Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
 
-## used_for_fs
+## used_for_app
 
-- Type: string
+- Тип: строка
 
-Если непусто, пул помечается как используемый для файловой системы VitastorFS с
-метаданными, хранимыми в блочном образе Vitastor с именем, равным значению
-этого параметра.
+Если непусто, пул помечается как используемый для отдельного приложения, например,
+для VitastorFS или S3, которое распределяет ID образов в пуле само и не использует
+метаданные образов/инодов в etcd.
 
-Когда пул помечается как используемый для VitastorFS, создание обычных блочных
-образов в нём отключается (vitastor-cli отказывается создавать образы без --force),
-чтобы защитить пользователя от коллизий ID файлов и блочных образов и, таким
-образом, от потери данных.
+Когда пул помечается используемым для такого приложения, создание обычных блочных
+образов в нём запрещается (vitastor-cli отказывается создавать образы без --force),
+чтобы защитить пользователя от коллизий ID блочных образов и томов ФС/S3, и,
+таким образом, от потери данных.
+
+Также для таких пулов отключается передача статистики в etcd по отдельным инодам,
+так как использование для внешнего приложения подразумевает, что пул может содержать
+очень много томов и их статистика может занять слишком много места в etcd.
+
+Установка used_for_app в значение `fs:<name>` сообщает о том, что пул используется
+для VitastorFS с базой метаданных VitastorKV, хранимой в блочном образе с именем
+`<name>`.
 
 [vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
-использовать для ФС пулы, не выделенные для неё. Это также означает, что один
-пул может использоваться только для одной VitastorFS.
+использовать для ФС пулы, не помеченные, как используемые для неё. Это также
+означает, что один пул может использоваться только для одной VitastorFS.
 
-Также для ФС-пулов отключается передача статистики в etcd по отдельным инодам,
-так как ФС-пул может содержать очень много файлов и статистика по ним всем
-заняла бы очень много места в etcd.
+Если же вы планируете использовать пул для данных S3, установите его used_for_app
+в значение `s3:<name>`, где `<name>` - любое название по вашему усмотрению
+(например, `s3:standard`) - конкретное содержимое `<name>` пока никак не проверяется
+компонентами Vitastor S3.
+
+Все остальные значения used_for_app, кроме начинающихся на `fs:` или `s3:`, не
+означают ничего особенного для основных компонентов Vitastor. Поэтому сейчас вы
+можете использовать их свободно любым желаемым способом.
 
 # Примеры
 
|
@@ -61,6 +61,24 @@
|
||||
info_ru: |
|
||||
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
|
||||
ожидать, пока на OSD не освободится место.
|
||||
- name: client_wait_up_timeout
|
||||
type: sec
|
||||
default: 16
|
||||
online: true
|
||||
info: |
|
||||
Wait for this number of seconds until PGs are up when doing operations
|
||||
which require all PGs to be up. Currently only used by object listings
|
||||
in delete and merge-based commands ([vitastor-cli rm](../usage/cli.en.md#rm), merge and so on).
|
||||
|
||||
The default value is calculated as `1 + OSD lease timeout`, which is
|
||||
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
|
||||
info_ru: |
|
||||
Время ожидания поднятия PG при операциях, требующих активности всех PG.
|
||||
В данный момент используется листингами объектов в командах, использующих
|
||||
удаление и слияние ([vitastor-cli rm](../usage/cli.ru.md#rm), merge и подобные).
|
||||
|
||||
Значение по умолчанию вычисляется как `1 + время lease OSD`, равное
|
||||
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
|
||||
- name: client_max_dirty_bytes
|
||||
type: int
|
||||
default: 33554432
|
||||
|
@@ -14,8 +14,12 @@
 
 {{../../installation/packages.en.md}}
 
+{{../../installation/docker.en.md}}
+
 {{../../installation/proxmox.en.md}}
 
 {{../../installation/opennebula.en.md}}
 
+{{../../installation/openstack.en.md}}
+
 {{../../installation/kubernetes.en.md}}
@@ -14,8 +14,12 @@
 
 {{../../installation/packages.ru.md}}
 
+{{../../installation/docker.ru.md}}
+
 {{../../installation/proxmox.ru.md}}
 
 {{../../installation/opennebula.ru.md}}
 
+{{../../installation/openstack.ru.md}}
+
 {{../../installation/kubernetes.ru.md}}
@@ -315,7 +315,7 @@
 decrease write performance for fast disks because page cache is an overhead
 itself.
 
-Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
+Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
 (which requires disable_data_fsync) with drives having write-back cache
 which can't be turned off, for example, Intel Optane. Also note that *some*
 desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
docs/installation/docker.en.md (new file, 60 lines)
@@ -0,0 +1,60 @@
+[Documentation](../../README.md#documentation) → Installation → Dockerized Installation
+
+-----
+
+[Читать на русском](docker.ru.md)
+
+# Dockerized Installation
+
+Vitastor may be installed in Docker/Podman. In such setups etcd, monitors and OSDs
+all run in containers, but everything else looks as close as possible to a usual
+setup with packages:
+- host network is used
+- auto-start is implemented through udev and systemd
+- logs are written to journald (not to Docker JSON log files)
+- command-line wrapper scripts are installed to the host system to call vitastor-disk,
+  vitastor-cli and others through the container
+
+Such installations may be useful when it's impossible or inconvenient to install
+Vitastor from packages, for example, in exotic Linux distributions.
+
+If you want more than just a simple containerized installation, you can also take a look
+at the Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operator
+
+## Installing Containers
+
+The instruction is very simple.
+
+1. Download a Docker image of the desired version: \
+   `docker pull vitastor:1.10.2`
+2. Install scripts to the host system: \
+   `docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
+3. Reload udev rules: \
+   `udevadm control --reload-rules`
+
+Then you can return to [Quick Start](../intro/quickstart.en.md).
+
+## Upgrading Containers
+
+First, check the topic [Upgrading Vitastor](../usage/admin.en.md#upgrading-vitastor)
+to figure out if you need any additional steps.
+
+Then, to upgrade a containerized installation, you just need to change the `VITASTOR_VERSION`
+option in `/etc/vitastor/docker.conf` and restart all Vitastor services:
+
+`systemctl restart vitastor.target`
+
+## QEMU
+
+The Vitastor Docker image also contains QEMU, qemu-img and qemu-storage-daemon built with Vitastor support.
+
+However, running QEMU in Docker is harder to set up, and the way to run it depends on the
+virtualization UI in use (OpenNebula, Proxmox and so on). Some of them also require a patched Libvirt.
+
+That's why the containerized installation of Vitastor doesn't contain a ready-made QEMU setup; it's
+recommended to install QEMU from packages or build it manually.
+
+## fio
+
+The Vitastor Docker image also contains fio and installs a wrapper called `vitastor-fio` to use it from
+the host system.
docs/installation/docker.ru.md (new file, 60 lines)
@@ -0,0 +1,60 @@
+[Документация](../../README-ru.md#документация) → Установка → Установка в Docker
+
+-----
+
+[Read in English](docker.en.md)
+
+# Установка в Docker
+
+Vitastor можно установить в Docker/Podman. При этом etcd, мониторы и OSD запускаются
+в контейнерах, но всё остальное выглядит максимально приближенно к установке из пакетов:
+- используется сеть хост-системы
+- для автозапуска используются udev и systemd
+- журналы записываются в journald (не в json-файлы журналов docker)
+- в хост-систему устанавливаются обёртки для вызова консольных инструментов vitastor-disk,
+  vitastor-cli и других через контейнер
+
+Такая установка полезна тогда, когда установка из пакетов невозможна или неудобна,
+например, в нестандартных Linux-дистрибутивах.
+
+Если вам нужна не просто контейнеризованная инсталляция, вы также можете обратить внимание
+на Vitastor Kubernetes-оператор: https://github.com/Antilles7227/vitastor-operator
+
+## Установка контейнеров
+
+Инструкция по установке максимально простая.
+
+1. Скачайте Docker-образ желаемой версии: \
+   `docker pull vitastor:1.10.2`
+2. Установите скрипты в хост-систему командой: \
+   `docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
+3. Перезагрузите правила udev: \
+   `udevadm control --reload-rules`
+
+После этого вы можете возвращаться к разделу [Быстрый старт](../intro/quickstart.ru.md).
+
+## Обновление контейнеров
+
+Сначала обязательно проверьте раздел [Обновление Vitastor](../usage/admin.ru.md#обновление-vitastor),
+чтобы понять, не требуются ли вам какие-то дополнительные действия.
+
+После этого для обновления Docker-инсталляции вам нужно просто поменять опцию `VITASTOR_VERSION`
+в файле `/etc/vitastor/docker.conf` и перезапустить все сервисы Vitastor командой:
+
+`systemctl restart vitastor.target`
+
+## QEMU
+
+В Docker-образ также входят QEMU, qemu-img и qemu-storage-daemon, собранные с поддержкой Vitastor.
+
+Однако настроить запуск QEMU в Docker сложнее, и способ запуска зависит от используемого интерфейса
+виртуализации (OpenNebula, Proxmox и т.п.). Также для OpenNebula, например, требуется патченый
+Libvirt.
+
+Поэтому по умолчанию Docker-сборка пока что не включает в себя готового способа запуска QEMU,
+и QEMU рекомендуется устанавливать из пакетов или собирать самостоятельно.
+
+## fio
+
+fio также входит в Docker-контейнер vitastor, и в хост-систему устанавливается обёртка `vitastor-fio`
+для запуска fio в контейнере.
@@ -14,6 +14,7 @@
 - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
 - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
 - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
 - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
   stable version from 0.9.x branch instead of 1.x
 - Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
@@ -14,6 +14,7 @@
 - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
 - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
 - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
 - Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
   установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
 - Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
@@ -26,13 +26,13 @@
   you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
 - Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
-- [Install Vitastor packages](../installation/packages.en.md).
+- Either [install Vitastor packages](../installation/packages.en.md) or [install Vitastor in Docker](../installation/docker.en.md).
 
 ## Recommended drives
 
 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320/P5530, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
 - HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
 
 ## Configure monitors
@@ -45,7 +45,8 @@ On the monitor hosts:
    }
    ```
 - Create systemd units for etcd by running: `/usr/lib/vitastor/mon/make-etcd`
-- Start etcd and monitors: `systemctl enable --now etcd vitastor-mon`
+  Or, if you installed Vitastor in Docker, run `systemctl start vitastor-host; docker exec vitastor make-etcd`.
+- Start etcd and monitors: `systemctl enable --now vitastor-etcd vitastor-mon`
 
 ## Configure OSDs
 
@@ -26,13 +26,13 @@
   обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
 - Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
 - Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
-- [Установите пакеты Vitastor](../installation/packages.ru.md).
+- Либо [установите пакеты Vitastor](../installation/packages.ru.md), либо [установите Vitastor в Docker](../installation/docker.ru.md).
 
 ## Рекомендуемые диски
 
 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320/P5530, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
 - HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
 
 ## Настройте мониторы
@@ -44,8 +44,9 @@
    "etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"]
    }
    ```
-- Инициализируйте сервисы etcd, запустив `/usr/lib/vitastor/mon/make-etcd`
-- Запустите etcd и мониторы: `systemctl enable --now etcd vitastor-mon`
+- Инициализируйте сервисы etcd, запустив `/usr/lib/vitastor/mon/make-etcd`.\
+  Либо, если вы установили Vitastor в Docker, запустите `systemctl start vitastor-host; docker exec vitastor make-etcd`.
+- Запустите etcd и мониторы: `systemctl enable --now vitastor-etcd vitastor-mon`
 
 ## Настройте OSD
 
@@ -37,7 +37,7 @@ It supports the following commands:
 Global options:
 
 ```
---config_file FILE  Path to Vitastor configuration file
+--config_path FILE  Path to Vitastor configuration file
 --etcd_address URL  Etcd connection address
 --iodepth N         Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M   Work with M osds in parallel when possible (default 4)
@@ -146,6 +146,7 @@ Rename, resize image or change its readonly status. Images with children can't b
 If the new size is smaller than the old size, extra data will be purged.
 You should resize file system in the image, if present, before shrinking it.
 
+* `--deleted 1|0` - Set/clear 'deleted image' flag (set automatically during unfinished deletes).
 * `-f|--force` - Proceed with shrinking or setting readwrite flag even if the image has children.
 * `--down-ok` - Proceed with shrinking even if some data will be left on unavailable OSDs.
 
@@ -221,6 +222,7 @@ Remove inode data without changing metadata.
     Requires more memory, but allows to show correct removal progress.
 --min-offset  Purge only data starting with specified offset.
 --max-offset  Purge only data before specified offset.
+--client_wait_up_timeout 16  Timeout for waiting until PGs are up in seconds.
 ```
 
 ## merge-data
@@ -396,7 +398,8 @@ Optional parameters:
 | `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
 | `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
 | `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
-| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
+| `--used_for_app fs:<name>` | Mark pool as used for VitastorFS with metadata in image `<name>` |
+| `--used_for_app s3:<name>` | Mark pool as used for S3 location with name `<name>` |
 | `--pg_stripe_size <number>` | Increase object grouping stripe |
 | `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
 | `--wait` | Wait for the new pool to come online |
@@ -36,7 +36,7 @@ vitastor-cli - интерфейс командной строки для адм
 Глобальные опции:
 
 ```
---config_file FILE  Путь к файлу конфигурации Vitastor
+--config_path FILE  Путь к файлу конфигурации Vitastor
 --etcd_address URL  Адрес соединения с etcd
 --iodepth N         Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
 --parallel_osds M   Работать параллельно с M OSD (по умолчанию 4)
@@ -149,6 +149,7 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 Если новый размер меньше старого, "лишние" данные будут удалены, поэтому перед уменьшением
 образа сначала уменьшите файловую систему в нём.
 
+* `--deleted 1|0` - Установить/снять флаг "образ удалён" (устанавливается при незавершённом удалении).
 * `-f|--force` - Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
 * `--down-ok` - Разрешить уменьшение, даже если часть данных останется неудалённой на недоступных OSD.
 
@@ -226,6 +227,7 @@ vitastor-cli dd [iimg=<image> | if=<file>] [oimg=<image> | of=<file>] [bs=1M] \
     Требует больше памяти, но позволяет правильно печатать прогресс удаления.
 --min-offset  Удалять только данные, начиная с заданного смещения.
 --max-offset  Удалять только данные до (исключительно) заданного смещения.
+--client_wait_up_timeout 16  Время ожидания поднятия PG в секундах.
 ```
 
 ## merge-data
@@ -36,7 +36,7 @@ It will output a block device name like /dev/nbd0 which you can then use as a no
 
 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
 
-vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
+vitastor-nbd supports all usual Vitastor configuration options like `--config_path <path_to_config>` plus NBD-specific:
 
 * `--nbd_timeout 0` \
   Timeout for I/O operations in seconds after exceeding which the kernel stops the device.
@@ -54,16 +54,18 @@ vitastor-nbd supports all usual Vitastor configuration options like `--config_fi
   Stay in foreground, do not daemonize.
 
 Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
-in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
+in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_path`.
 
 ## unmap
 
 To unmap the device run:
 
 ```
-vitastor-nbd unmap /dev/nbd0
+vitastor-nbd unmap [--force] /dev/nbd0
 ```
 
+If `--force` is specified, `vitastor-nbd` doesn't check if the device is actually mapped.
+
 ## ls
 
 ```
@@ -41,7 +41,7 @@ vitastor-nbd map [/dev/nbdN] --image testimg
 Для обращения по номеру инода, аналогично другим командам, можно использовать опции
 `--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
 
-vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
+vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_path <path_to_config>`,
 плюс специфичные для NBD:
 
 * `--nbd_timeout 0` \
@@ -62,16 +62,19 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
 
 Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
 также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
-заданном опцией `--config_file`.
+заданном опцией `--config_path`.
 
 ## unmap
 
 Для отключения устройства выполните:
 
 ```
-vitastor-nbd unmap /dev/nbd0
+vitastor-nbd unmap [--force] /dev/nbd0
 ```
 
+Если задана опция `--force`, `vitastor-nbd` не проверяет, подключено ли устройство,
+перед попыткой его отключить.
+
 ## ls
 
 ```
@@ -58,7 +58,7 @@ To use VitastorFS:
 2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
    but you can create it in the data pool too if you want (image size doesn't matter):
    `vitastor-cli create -s 10G -p fastpool testfs`
-3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
+3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
 4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
 5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
|
@@ -60,7 +60,7 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
|
||||
(размер образа значения не имеет):
|
||||
`vitastor-cli create -s 10G -p fastpool testfs`
|
||||
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
|
||||
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
|
||||
4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
||||
5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
|
||||
|
||||
|
@@ -23,6 +23,9 @@ class AntiEtcdAdapter
         }, {}));
         const cfg_port = config.antietcd_port;
         const is_local = local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
+        is_local['0.0.0.0'] = true;
+        is_local['::'] = true;
+        is_local[''] = true;
         const selected = cluster.map(s => s.split(':', 2)).filter(ip => is_local[ip[0]] && (!cfg_port || ip[1] == cfg_port));
         if (selected.length > 1)
         {
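The added wildcard entries make the local-address check accept listen-anywhere configurations. An illustration of the selection logic with hypothetical values:

```js
// Illustration of the filter above (all values hypothetical): the wildcard
// entries '0.0.0.0', '::' and '' now count as local, so an antietcd address
// bound to all interfaces is matched against the cluster list too.
const is_local = { '10.0.0.1': true, '0.0.0.0': true, '::': true, '': true };
const cluster = [ '10.0.0.1:2379', '10.0.0.2:2379', '0.0.0.0:2379' ];
const cfg_port = null;
const selected = cluster.map(s => s.split(':', 2))
    .filter(ip => is_local[ip[0]] && (!cfg_port || ip[1] == cfg_port));
// -> [ ['10.0.0.1','2379'], ['0.0.0.0','2379'] ]
```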
@@ -216,6 +216,7 @@ const etcd_tree = {
         parent_pool?: <pool_id>,
         parent_id?: <inode_t>,
         readonly?: boolean,
+        deleted?: boolean,
       }
     }
   }, */
mon/mon.js (34 lines changed)
@@ -773,23 +773,27 @@ class Mon
             }
         }
     }
-    for (const pool_id in this.state.pool.stats)
+    if (!this.recheck_pgs_active)
     {
-        if (!seen_pools[pool_id])
+        // PG recheck also modifies /pool/stats, so don't touch it here if it's active
+        for (const pool_id in this.state.pool.stats)
         {
-            txn.push({ requestDeleteRange: {
-                key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
-            } });
-            delete this.state.pool.stats[pool_id];
-        }
-        else
-        {
-            const pool_stats = { ...this.state.pool.stats[pool_id] };
-            serialize_bigints(pool_stats);
-            txn.push({ requestPut: {
-                key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
-                value: b64(JSON.stringify(pool_stats)),
-            } });
+            if (!seen_pools[pool_id])
+            {
+                txn.push({ requestDeleteRange: {
+                    key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
+                } });
+                delete this.state.pool.stats[pool_id];
+            }
+            else
+            {
+                const pool_stats = { ...this.state.pool.stats[pool_id] };
+                serialize_bigints(pool_stats);
+                txn.push({ requestPut: {
+                    key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
+                    value: b64(JSON.stringify(pool_stats)),
+                } });
+            }
         }
     }
     if (txn.length)
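The serialize_bigints() call matters here because JSON.stringify() throws a TypeError on BigInt values, which large byte counters in statistics typically are. A sketch of what such a helper has to do (illustrative, not Vitastor's actual implementation, and the stats key is hypothetical):

```js
// Illustrative recursive BigInt-to-string conversion; without it,
// JSON.stringify() throws "TypeError: Do not know how to serialize a BigInt".
function serialize_bigints(obj)
{
    for (const k in obj)
    {
        if (typeof obj[k] == 'bigint')
            obj[k] = obj[k].toString();
        else if (obj[k] && typeof obj[k] == 'object')
            serialize_bigints(obj[k]);
    }
}

const stats = { used_raw_bytes: 1n << 40n }; // hypothetical stats object
serialize_bigints(stats);
JSON.stringify(stats); // '{"used_raw_bytes":"1099511627776"}'
```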
@@ -1,6 +1,6 @@
 {
   "name": "vitastor-mon",
-  "version": "1.10.0",
+  "version": "1.11.0",
   "description": "Vitastor SDS monitor service",
   "main": "mon-main.js",
   "scripts": {
@@ -9,7 +9,7 @@
   "author": "Vitaliy Filippov",
   "license": "UNLICENSED",
   "dependencies": {
-    "antietcd": "^1.1.0",
+    "antietcd": "^1.1.2",
     "sprintf-js": "^1.1.2",
     "ws": "^7.2.5"
   },
@@ -8,23 +8,9 @@ const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
 const { scale_pg_count } = require('./pg_utils.js');
 const { make_hier_tree, filter_osds_by_root_node,
     filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
+const { select_murmur3 } = require('./lp_optimizer/murmur3.js');
 
-let seed;
-
-function reset_rng()
-{
-    seed = 0x5f020e43;
-}
-
-function rng()
-{
-    seed ^= seed << 13;
-    seed ^= seed >> 17;
-    seed ^= seed << 5;
-    return seed + 2147483648;
-}
-
-function pick_primary(pool_config, osd_set, up_osds, aff_osds)
+function pick_primary(pool_id, pg_num, pool_config, osd_set, up_osds, aff_osds)
 {
     let alive_set;
     if (pool_config.scheme === 'replicated')
@@ -52,7 +38,7 @@ function pick_primary(pool_config, osd_set, up_osds, aff_osds)
     {
         return 0;
     }
-    return alive_set[rng() % alive_set.length];
+    return alive_set[select_murmur3(alive_set.length, osd_num => pool_id+'/'+pg_num+'/'+osd_num)];
 }
 
 function recheck_primary(state, global_config, up_osds, osd_tree)
@@ -66,7 +52,6 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
             continue;
         }
         const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
-        reset_rng();
         for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
         {
             if (!state.pg.config.items[pool_id])
@@ -76,7 +61,7 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
             const pg_cfg = state.pg.config.items[pool_id][pg_num];
             if (pg_cfg)
             {
-                const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
+                const new_primary = pick_primary(pool_id, pg_num, state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
                 if (pg_cfg.primary != new_primary)
                 {
                     if (!new_pg_config)
@@ -99,13 +84,12 @@ function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revis
 {
     const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
     const pg_items = {};
-    reset_rng();
     new_pgs.map((osd_set, i) =>
     {
         osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
         pg_items[i+1] = {
             osd_set,
-            primary: pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds),
+            primary: pick_primary(pool_id, i+1, state.config.pools[pool_id], osd_set, up_osds, aff_osds),
         };
         if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
             prev_pgs[i].filter(osd_num => osd_num).length > 0)
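The point of replacing the stateful xorshift RNG with select_murmur3() is that the primary for a PG now depends only on the (pool, PG, candidate OSD) key, not on iteration order or monitor restarts. A toy sketch of such a deterministic pick, using FNV-1a in place of the real murmur3 and a highest-hash-wins rule that is only an assumption here, not necessarily what select_murmur3 does:

```js
// Toy deterministic primary selection: same inputs always give the same answer.
// FNV-1a stands in for murmur3; the argmax ("rendezvous") rule is illustrative.
function fnv1a(str)
{
    let h = 0x811c9dc5;
    for (let i = 0; i < str.length; i++)
    {
        h ^= str.charCodeAt(i);
        h = Math.imul(h, 0x01000193) >>> 0;
    }
    return h;
}

function pick_deterministic(alive_set, pool_id, pg_num)
{
    let best = 0;
    for (let i = 1; i < alive_set.length; i++)
        if (fnv1a(pool_id+'/'+pg_num+'/'+alive_set[i]) > fnv1a(pool_id+'/'+pg_num+'/'+alive_set[best]))
            best = i;
    return alive_set[best];
}

console.log(pick_deterministic([ 1, 2, 3 ], 1, 5)); // stable across runs and restarts
```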
@@ -33,9 +33,11 @@ async function run()
|
||||
console.log(config_path+' is missing');
|
||||
process.exit(1);
|
||||
}
|
||||
if (fs.existsSync("/etc/systemd/system/etcd.service"))
|
||||
const in_docker = fs.existsSync("/etc/vitastor/etcd.conf") &&
|
||||
fs.existsSync("/etc/vitastor/docker.conf");
|
||||
if (!in_docker && fs.existsSync("/etc/systemd/system/vitastor-etcd.service"))
|
||||
{
|
||||
console.log("/etc/systemd/system/etcd.service already exists");
|
||||
console.log("/etc/systemd/system/vitastor-etcd.service already exists");
|
||||
process.exit(1);
|
||||
}
|
||||
const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
|
||||
@@ -52,10 +54,21 @@ async function run()
|
||||
console.log('No matching IPs in etcd_address from '+config_path);
|
||||
process.exit(0);
|
||||
}
|
||||
const etcd_cluster = etcds.map((e, i) => `etcd${i}=http://${e}:2380`).join(',');
|
||||
await system(`mkdir -p /var/lib/etcd${num}.etcd`);
|
||||
const etcd_name = 'etcd'+etcds[num].replace(/[^0-9a-z_]/ig, '_');
|
||||
const etcd_cluster = etcds.map(e => `etcd${e.replace(/[^0-9a-z_]/ig, '_')}=http://${e}:2380`).join(',');
|
||||
if (in_docker)
|
||||
{
|
||||
let etcd_conf = fs.readFileSync("/etc/vitastor/etcd.conf", { encoding: 'utf-8' });
|
||||
etcd_conf = replace_env(etcd_conf, 'ETCD_NAME', etcd_name);
|
||||
etcd_conf = replace_env(etcd_conf, 'ETCD_IP', etcds[num]);
|
||||
etcd_conf = replace_env(etcd_conf, 'ETCD_INITIAL_CLUSTER', etcd_cluster);
|
||||
fs.writeFileSync("/etc/vitastor/etcd.conf", etcd_conf);
|
||||
console.log('etcd for Vitastor configured. Run `systemctl enable --now vitastor-etcd` to start etcd');
|
||||
process.exit(0);
|
||||
}
|
||||
await system(`mkdir -p /var/lib/etcd`);
|
||||
fs.writeFileSync(
|
||||
"/etc/systemd/system/etcd.service",
|
||||
"/etc/systemd/system/vitastor-etcd.service",
|
||||
`[Unit]
|
||||
Description=etcd for vitastor
|
||||
After=network-online.target local-fs.target time-sync.target
|
||||
@@ -64,14 +77,14 @@ Wants=network-online.target local-fs.target time-sync.target
|
||||
[Service]
|
||||
Restart=always
|
||||
Environment=GOGC=50
|
||||
ExecStart=etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
|
||||
ExecStart=etcd --name ${etcd_name} --data-dir /var/lib/etcd \\
|
||||
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
||||
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
|
||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
|
||||
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
||||
--auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||
WorkingDirectory=/var/lib/etcd${num}.etcd
|
||||
ExecStartPre=+chown -R etcd /var/lib/etcd${num}.etcd
|
||||
WorkingDirectory=/var/lib/etcd
|
||||
ExecStartPre=+chown -R etcd /var/lib/etcd
|
||||
User=etcd
|
||||
PrivateTmp=false
|
||||
TasksMax=infinity
|
||||
@@ -89,6 +102,13 @@ WantedBy=multi-user.target
|
||||
process.exit(0);
|
||||
}
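
The unit name and initial-cluster string are now derived from each etcd node's IP address instead of its index in the list, so they stay stable even if the address list is reordered. A worked example with made-up addresses:

```js
// Assuming etcds = ['10.0.0.5', '10.0.0.6'] and this host matched index 0:
const etcd_name = 'etcd'+'10.0.0.5'.replace(/[^0-9a-z_]/ig, '_');
// -> 'etcd10_0_0_5'
const etcd_cluster = ['10.0.0.5', '10.0.0.6']
    .map(e => `etcd${e.replace(/[^0-9a-z_]/ig, '_')}=http://${e}:2380`).join(',');
// -> 'etcd10_0_0_5=http://10.0.0.5:2380,etcd10_0_0_6=http://10.0.0.6:2380'
```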

function replace_env(text, key, value)
{
    let found = false;
    text = text.replace(new RegExp('^'+key+'\\s*=.*', 'm'), () => { found = true; return key+'='+value; });
    return found ? text : text.replace(/\s*$/, '\n')+key+'='+value+'\n';
}
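
`replace_env()` rewrites an existing `KEY=value` line in place, or appends one (with a trailing newline) if the key is absent. A quick illustration with made-up values:

```js
let conf = 'ETCD_NAME=old\nETCD_IP=10.0.0.4\n';
conf = replace_env(conf, 'ETCD_IP', '10.0.0.5');   // existing line is rewritten
conf = replace_env(conf, 'ETCD_PORT', '2379');     // missing key is appended
// conf == 'ETCD_NAME=old\nETCD_IP=10.0.0.5\nETCD_PORT=2379\n'
```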

function select_local_etcd(etcds)
{
    const ifaces = os.networkInterfaces();

@@ -5,6 +5,7 @@ Wants=network-online.target local-fs.target time-sync.target

[Service]
Restart=always
SyslogIdentifier=vitastor-mon
ExecStart=node /usr/lib/vitastor/mon/mon-main.js
WorkingDirectory=/
User=vitastor

@@ -8,6 +8,7 @@ PartOf=vitastor.target
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
SyslogIdentifier=vitastor-osd%i
# Use the following for direct logs to files
#ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1'
ExecStart=vitastor-disk exec-osd /dev/vitastor/osd%i-data

@@ -14,8 +14,13 @@ NAN_MODULE_INIT(InitAddon)

    Nan::SetPrototypeMethod(tpl, "read", NodeVitastor::Read);
    Nan::SetPrototypeMethod(tpl, "write", NodeVitastor::Write);
    Nan::SetPrototypeMethod(tpl, "delete", NodeVitastor::Delete);
    Nan::SetPrototypeMethod(tpl, "sync", NodeVitastor::Sync);
    Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastor::ReadBitmap);
    Nan::SetPrototypeMethod(tpl, "on_ready", NodeVitastor::OnReady);
    Nan::SetPrototypeMethod(tpl, "get_min_io_size", NodeVitastor::GetMinIoSize);
    Nan::SetPrototypeMethod(tpl, "get_max_atomic_write_size", NodeVitastor::GetMaxAtomicWriteSize);
    Nan::SetPrototypeMethod(tpl, "get_immediate_commit", NodeVitastor::GetImmediateCommit);
    //Nan::SetPrototypeMethod(tpl, "destroy", NodeVitastor::Destroy);

    Nan::Set(target, Nan::New("Client").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
@@ -63,6 +68,10 @@ NAN_MODULE_INIT(InitAddon)
    Nan::Set(target, Nan::New("ENOSYS").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSYS));
    Nan::Set(target, Nan::New("EAGAIN").ToLocalChecked(), Nan::New<v8::Int32>(-EAGAIN));

    Nan::Set(target, Nan::New("IMMEDIATE_NONE").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_NONE));
    Nan::Set(target, Nan::New("IMMEDIATE_SMALL").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_SMALL));
    Nan::Set(target, Nan::New("IMMEDIATE_ALL").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_ALL));

    // Listing handle

    tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKVListing::Create);

@@ -5,9 +5,10 @@

#define NODE_VITASTOR_READ 1
#define NODE_VITASTOR_WRITE 2
#define NODE_VITASTOR_SYNC 3
#define NODE_VITASTOR_READ_BITMAP 4
#define NODE_VITASTOR_GET_INFO 5
#define NODE_VITASTOR_DELETE 3
#define NODE_VITASTOR_SYNC 4
#define NODE_VITASTOR_READ_BITMAP 5
#define NODE_VITASTOR_GET_INFO 6

#ifndef INODE_POOL
#define INODE_POOL(inode) (uint32_t)((inode) >> (64 - POOL_ID_BITS))
@@ -80,6 +81,11 @@ NAN_METHOD(NodeVitastor::Create)
    NodeVitastor* cli = new NodeVitastor();
    cli->c = vitastor_c_create_uring_json(c_cfg, cfg.size());
    delete[] c_cfg;
    if (!cli->c)
    {
        Nan::ThrowError("failed to initialize io_uring (old kernel or insufficient ulimit -l?)");
        return;
    }

    int res = vitastor_c_uring_register_eventfd(cli->c);
    if (res >= 0)
@@ -133,12 +139,12 @@ NodeVitastorRequest* NodeVitastor::get_read_request(const Nan::FunctionCallbackI
    return req;
}

// read(pool, inode, offset, len, callback(err, buffer, version))
// read(pool, inode, offset, length, callback(err, buffer, version))
NAN_METHOD(NodeVitastor::Read)
{
    TRACE("NodeVitastor::Read");
    if (info.Length() < 5)
        Nan::ThrowError("Not enough arguments to read(pool, inode, offset, len, callback(err, buffer, version))");
        Nan::ThrowError("Not enough arguments to read(pool, inode, offset, length, callback(err, buffer, version))");

    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());

@@ -149,6 +155,9 @@ NAN_METHOD(NodeVitastor::Read)

    self->Ref();
    vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, on_read_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}

NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
@@ -217,6 +226,58 @@ NAN_METHOD(NodeVitastor::Write)
        req->iov_list.size() ? req->iov_list.data() : &req->iov,
        req->iov_list.size() ? req->iov_list.size() : 1,
        on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}

NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
{
    uint64_t offset = get_ui64(info[argpos+0]);
    uint64_t len = get_ui64(info[argpos+1]);
    uint64_t version = 0;
    if (!info[argpos+2].IsEmpty() &&
        !info[argpos+2]->IsFunction() &&
        info[argpos+2]->IsObject())
    {
        auto key = Nan::New<v8::String>("version").ToLocalChecked();
        auto params = info[argpos+2].As<v8::Object>();
        auto versionObj = Nan::Get(params, key).ToLocalChecked();
        if (!versionObj.IsEmpty())
            version = get_ui64(versionObj);
        argpos++;
    }

    v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
    auto req = new NodeVitastorRequest(this, callback);

    req->offset = offset;
    req->len = len;
    req->version = version;

    return req;
}

// delete(pool, inode, offset, length, { version }?, callback(err))
NAN_METHOD(NodeVitastor::Delete)
{
    TRACE("NodeVitastor::Delete");
    if (info.Length() < 5)
        Nan::ThrowError("Not enough arguments to delete(pool, inode, offset, length, { version }?, callback(err))");

    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());

    uint64_t pool = get_ui64(info[0]);
    uint64_t inode = get_ui64(info[1]);

    auto req = self->get_delete_request(info, 2);

    self->Ref();
    vitastor_c_delete(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
        on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}
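
A sketch of how the new `delete()` binding might be called from Node.js. The module name follows package.json and the argument order matches the `delete(pool, inode, offset, length, { version }?, callback(err))` signature above, but the constructor options and the pool/inode numbers are assumptions for the example:

```js
const vitastor = require('vitastor');
const cli = new vitastor.Client({ config_path: '/etc/vitastor/vitastor.conf' });
cli.on_ready(() =>
{
    // Delete 1 MiB of pool 1, inode 1, starting at offset 0
    cli.delete(1, 1, 0, 1048576, (err) =>
    {
        if (err)
            throw new Error('delete failed: '+err);
        cli.sync((err) => console.log(err ? 'sync failed' : 'done'));
    });
});
```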

// sync(callback(err))
@@ -233,14 +294,17 @@ NAN_METHOD(NodeVitastor::Sync)

    self->Ref();
    vitastor_c_sync(self->c, on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}

// read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
// read_bitmap(pool, inode, offset, length, with_parents, callback(err, bitmap_buffer))
NAN_METHOD(NodeVitastor::ReadBitmap)
{
    TRACE("NodeVitastor::ReadBitmap");
    if (info.Length() < 6)
        Nan::ThrowError("Not enough arguments to read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))");
        Nan::ThrowError("Not enough arguments to read_bitmap(pool, inode, offset, length, with_parents, callback(err, bitmap_buffer))");

    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());

@@ -254,6 +318,9 @@ NAN_METHOD(NodeVitastor::ReadBitmap)
    auto req = new NodeVitastorRequest(self, callback);
    self->Ref();
    vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, on_read_bitmap_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}

static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
@@ -267,6 +334,67 @@ static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long
    nanCallback.Call(1, args, req);
}

// on_ready(callback(err))
NAN_METHOD(NodeVitastor::OnReady)
{
    TRACE("NodeVitastor::OnReady");
    if (info.Length() < 1)
        Nan::ThrowError("Not enough arguments to on_ready(callback(err))");
    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
    v8::Local<v8::Function> callback = info[0].As<v8::Function>();
    auto req = new NodeVitastorRequest(self, callback);
    self->Ref();
    vitastor_c_on_ready(self->c, on_ready_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(self->c);
#endif
}

void NodeVitastor::on_ready_finish(void *opaque, long retval)
{
    TRACE("NodeVitastor::on_ready_finish");
    auto req = (NodeVitastorRequest*)opaque;
    auto self = req->cli;
    Nan::HandleScope scope;
    Nan::Callback nanCallback(Nan::New(req->callback));
    nanCallback.Call(0, NULL, req);
    self->Unref();
    delete req;
}

// get_min_io_size(pool_id)
NAN_METHOD(NodeVitastor::GetMinIoSize)
{
    TRACE("NodeVitastor::GetMinIoSize");
    if (info.Length() < 1)
        Nan::ThrowError("Not enough arguments to get_min_io_size(pool_id)");
    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
    uint64_t pool = get_ui64(info[0]);
    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_bitmap_granularity(self->c, INODE_WITH_POOL(pool, 1))));
}

// get_max_atomic_write_size(pool_id)
NAN_METHOD(NodeVitastor::GetMaxAtomicWriteSize)
{
    TRACE("NodeVitastor::GetMaxAtomicWriteSize");
    if (info.Length() < 1)
        Nan::ThrowError("Not enough arguments to get_max_atomic_write_size(pool_id)");
    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
    uint64_t pool = get_ui64(info[0]);
    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_block_size(self->c, INODE_WITH_POOL(pool, 1))));
}

// get_immediate_commit(pool_id)
NAN_METHOD(NodeVitastor::GetImmediateCommit)
{
    TRACE("NodeVitastor::GetImmediateCommit");
    if (info.Length() < 1)
        Nan::ThrowError("Not enough arguments to get_immediate_commit(pool_id)");
    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
    uint64_t pool = get_ui64(info[0]);
    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_immediate_commit(self->c, INODE_WITH_POOL(pool, 1))));
}
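
Together with the `IMMEDIATE_*` constants exported above, these three pool-introspection getters let a caller size I/O correctly and decide whether an explicit `sync()` is required. A hedged usage sketch (pool id 1 is an assumption):

```js
const vitastor = require('vitastor');
const cli = new vitastor.Client({ config_path: '/etc/vitastor/vitastor.conf' });
cli.on_ready(() =>
{
    const pool_id = 1;
    const min_io = cli.get_min_io_size(pool_id);           // bitmap granularity
    const atomic = cli.get_max_atomic_write_size(pool_id); // pool block size
    const imm = cli.get_immediate_commit(pool_id);
    console.log('align I/O to', min_io, 'bytes, atomic writes up to', atomic, 'bytes');
    if (imm != vitastor.IMMEDIATE_ALL)
        console.log('writes need an explicit sync() to become durable');
});
```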

void NodeVitastor::on_read_finish(void *opaque, long retval, uint64_t version)
{
    TRACE("NodeVitastor::on_read_finish");
@@ -364,6 +492,9 @@ NAN_METHOD(NodeVitastorImage::Create)
    img->Ref();
    cli->Ref();
    vitastor_c_watch_inode(cli->c, (char*)img->name.c_str(), on_watch_start, img);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(cli->c);
#endif

    info.GetReturnValue().Set(info.This());
}
@@ -378,12 +509,12 @@ NodeVitastorImage::~NodeVitastorImage()
        cli->Unref();
}

// read(offset, len, callback(err, buffer, version))
// read(offset, length, callback(err, buffer, version))
NAN_METHOD(NodeVitastorImage::Read)
{
    TRACE("NodeVitastorImage::Read");
    if (info.Length() < 3)
        Nan::ThrowError("Not enough arguments to read(offset, len, callback(err, buffer, version))");
        Nan::ThrowError("Not enough arguments to read(offset, length, callback(err, buffer, version))");

    NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());

@@ -394,12 +525,12 @@ NAN_METHOD(NodeVitastorImage::Read)
    img->exec_or_wait(req);
}

// write(offset, buffer, { version }?, callback(err))
// write(offset, buf: Buffer | Buffer[], { version }?, callback(err))
NAN_METHOD(NodeVitastorImage::Write)
{
    TRACE("NodeVitastorImage::Write");
    if (info.Length() < 3)
        Nan::ThrowError("Not enough arguments to write(offset, buffer, { version }?, callback(err))");
        Nan::ThrowError("Not enough arguments to write(offset, buf: Buffer | Buffer[], { version }?, callback(err))");

    NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());

@@ -410,6 +541,22 @@ NAN_METHOD(NodeVitastorImage::Write)
    img->exec_or_wait(req);
}

// delete(offset, length, { version }?, callback(err))
NAN_METHOD(NodeVitastorImage::Delete)
{
    TRACE("NodeVitastorImage::Delete");
    if (info.Length() < 3)
        Nan::ThrowError("Not enough arguments to delete(offset, length, { version }?, callback(err))");

    NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());

    auto req = img->cli->get_delete_request(info, 0);
    req->img = img;
    req->op = NODE_VITASTOR_DELETE;

    img->exec_or_wait(req);
}

// sync(callback(err))
NAN_METHOD(NodeVitastorImage::Sync)
{
@@ -427,12 +574,12 @@ NAN_METHOD(NodeVitastorImage::Sync)
    img->exec_or_wait(req);
}

// read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
// read_bitmap(offset, length, with_parents, callback(err, bitmap_buffer))
NAN_METHOD(NodeVitastorImage::ReadBitmap)
{
    TRACE("NodeVitastorImage::ReadBitmap");
    if (info.Length() < 4)
        Nan::ThrowError("Not enough arguments to read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))");
        Nan::ThrowError("Not enough arguments to read_bitmap(offset, length, with_parents, callback(err, bitmap_buffer))");

    NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());

@@ -488,6 +635,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
        uint64_t ino = vitastor_c_inode_get_num(watch);
        cli->Ref();
        vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::on_read_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
        vitastor_c_uring_handle_events(cli->c);
#endif
    }
    else if (req->op == NODE_VITASTOR_WRITE)
    {
@@ -497,6 +647,19 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
            req->iov_list.size() ? req->iov_list.data() : &req->iov,
            req->iov_list.size() ? req->iov_list.size() : 1,
            NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
        vitastor_c_uring_handle_events(cli->c);
#endif
    }
    else if (req->op == NODE_VITASTOR_DELETE)
    {
        uint64_t ino = vitastor_c_inode_get_num(watch);
        cli->Ref();
        vitastor_c_delete(cli->c, ino, req->offset, req->len, req->version,
            NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
        vitastor_c_uring_handle_events(cli->c);
#endif
    }
    else if (req->op == NODE_VITASTOR_SYNC)
    {
@@ -506,6 +669,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
        if (imm != IMMEDIATE_ALL)
        {
            vitastor_c_sync(cli->c, NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
            vitastor_c_uring_handle_events(cli->c);
#endif
        }
        else
        {
@@ -517,6 +683,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
        uint64_t ino = vitastor_c_inode_get_num(watch);
        cli->Ref();
        vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::on_read_bitmap_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
        vitastor_c_uring_handle_events(cli->c);
#endif
    }
    else if (req->op == NODE_VITASTOR_GET_INFO)
    {
@@ -648,6 +817,9 @@ NAN_METHOD(NodeVitastorKV::Open)
        delete req;
        kv->Unref();
    });
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(kv->cli->c);
#endif
}

// close(callback(err))
@@ -671,6 +843,9 @@ NAN_METHOD(NodeVitastorKV::Close)
        delete req;
        kv->Unref();
    });
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(kv->cli->c);
#endif
}

// set_config({ ...config })
@@ -729,6 +904,9 @@ void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info,
        delete req;
        kv->Unref();
    }, allow_cache);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(kv->cli->c);
#endif
}

// get(key, callback(err, value))
@@ -801,6 +979,9 @@ NAN_METHOD(NodeVitastorKV::Set)
        delete cas_req;
        kv->Unref();
    }, cas_cb);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(kv->cli->c);
#endif
}

// del(key, callback(err), cas_compare(old_value)?)
@@ -839,6 +1020,9 @@ NAN_METHOD(NodeVitastorKV::Del)
        delete cas_req;
        kv->Unref();
    }, cas_cb);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(kv->cli->c);
#endif
}

// list(start_key?)
@@ -909,7 +1093,7 @@ NodeVitastorKVListing::~NodeVitastorKVListing()
        kv->Unref();
}

// next(callback(err, value)?)
// next(callback(err, key, value)?)
NAN_METHOD(NodeVitastorKVListing::Next)
{
    TRACE("NodeVitastorKVListing::Next");
@@ -959,6 +1143,9 @@ NAN_METHOD(NodeVitastorKVListing::Next)
        list->iter = req;
        list->kv->Unref();
    });
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_c_uring_handle_events(list->kv->cli->c);
#endif
}
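
The listing callback now delivers the key as well as the value. A heavily hedged sketch of pumping a listing from JS — the `KVListing` export name and the `ENOENT` end-of-listing convention are assumptions; only the constructor and `next()`/`close()` signatures are taken from the declarations in this diff:

```js
// kv is an already-opened KV store instance
const lst = new vitastor.KVListing(kv, 'prefix/');  // start_key is optional
const pump = () => lst.next((err, key, value) =>
{
    if (err)
    {
        // e.g. vitastor.ENOENT when the listing is exhausted (assumption)
        lst.close();
        return;
    }
    console.log(key, '=>', value);
    pump();
});
pump();
```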

// close()

@@ -15,14 +15,24 @@ class NodeVitastor: public Nan::ObjectWrap
public:
    // constructor({ ...config })
    static NAN_METHOD(Create);
    // read(pool, inode, offset, len, callback(err, buffer, version))
    // read(pool_id, inode, offset, length, callback(err, buffer, version))
    static NAN_METHOD(Read);
    // write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
    // write(pool_id, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
    static NAN_METHOD(Write);
    // delete(pool_id, inode, offset, length, { version }?, callback(err))
    static NAN_METHOD(Delete);
    // sync(callback(err))
    static NAN_METHOD(Sync);
    // read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
    // read_bitmap(pool_id, inode, offset, length, with_parents, callback(err, bitmap_buffer))
    static NAN_METHOD(ReadBitmap);
    // on_ready(callback(err))
    static NAN_METHOD(OnReady);
    // get_min_io_size(pool_id)
    static NAN_METHOD(GetMinIoSize);
    // get_max_atomic_write_size(pool_id)
    static NAN_METHOD(GetMaxAtomicWriteSize);
    // get_immediate_commit(pool_id)
    static NAN_METHOD(GetImmediateCommit);
    // // destroy()
    // static NAN_METHOD(Destroy);

@@ -37,11 +47,13 @@ private:

    static void on_io_readable(uv_poll_t* handle, int status, int revents);
    static void on_read_finish(void *opaque, long retval, uint64_t version);
    static void on_ready_finish(void *opaque, long retval);
    static void on_write_finish(void *opaque, long retval);
    static void on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);

    NodeVitastorRequest* get_read_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
    NodeVitastorRequest* get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
    NodeVitastorRequest* get_delete_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);

    friend class NodeVitastorImage;
    friend class NodeVitastorKV;
@@ -53,13 +65,15 @@ class NodeVitastorImage: public Nan::ObjectWrap
public:
    // constructor(node_vitastor, name)
    static NAN_METHOD(Create);
    // read(offset, len, callback(err, buffer, version))
    // read(offset, length, callback(err, buffer, version))
    static NAN_METHOD(Read);
    // write(offset, buf: Buffer | Buffer[], { version }?, callback(err))
    static NAN_METHOD(Write);
    // delete(offset, length, { version }?, callback(err))
    static NAN_METHOD(Delete);
    // sync(callback(err))
    static NAN_METHOD(Sync);
    // read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
    // read_bitmap(offset, length, with_parents, callback(err, bitmap_buffer))
    static NAN_METHOD(ReadBitmap);
    // get_info(callback({ num, name, size, parent_id?, readonly?, meta?, mod_revision, block_size, bitmap_granularity, immediate_commit }))
    static NAN_METHOD(GetInfo);
@@ -120,7 +134,7 @@ class NodeVitastorKVListing: public Nan::ObjectWrap
public:
    // constructor(node_vitastor_kv, start_key?)
    static NAN_METHOD(Create);
    // next(callback(err, value)?)
    // next(callback(err, key, value)?)
    static NAN_METHOD(Next);
    // close()
    static NAN_METHOD(Close);

@@ -1,6 +1,6 @@
{
  "name": "vitastor",
  "version": "1.7.0",
  "version": "1.11.0",
  "description": "Low-level native bindings to Vitastor client library",
  "main": "index.js",
  "keywords": [

@@ -98,8 +98,8 @@ vm.elements.each 'TEMPLATE/DISK' do |d|
    cmd = 'vitastor-cli'
    qemu_arg = ''
    if d.elements['VITASTOR_CONF']
        cmd = cmd + ' --config_path ' + d.elements['VITASTOR_CONF']
        qemu_arg += 'config_path='+d.elements['VITASTOR_CONF']+':'
        cmd = cmd + ' --config_path ' + d.elements['VITASTOR_CONF'].text
        qemu_arg += 'config_path=' + d.elements['VITASTOR_CONF'].text + ':'
    end

    draw = "#{bck_dir}/disk.#{did}.raw"

@@ -37,16 +37,6 @@ sub run_cli
    $json = 1 if !defined $json;
    my $binary = delete $args{binary};
    $binary = '/usr/bin/vitastor-cli' if !defined $binary;
    if (!exists($args{errfunc}))
    {
        $args{errfunc} = sub
        {
            my $line = shift;
            print STDERR $line;
            *STDERR->flush();
            $stderr .= $line;
        };
    }
    if (!exists($args{outfunc}))
    {
        $retval = '';

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

VITASTOR_VERSION = '1.10.0'
VITASTOR_VERSION = '1.11.0'

LOG = logging.getLogger(__name__)

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.10.0
Version: 1.11.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.10.0.el7.tar.gz
Source0: vitastor-1.11.0.el7.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.10.0
Version: 1.11.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.10.0.el8.tar.gz
Source0: vitastor-1.11.0.el8.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.10.0
Version: 1.11.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.10.0.el9.tar.gz
Source0: vitastor-1.11.0.el9.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
    set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()

add_definitions(-DVITASTOR_VERSION="1.10.0")
add_definitions(-DVITASTOR_VERSION="1.11.0")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

@@ -256,6 +256,7 @@ resume_2:
    }
    if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
    {
        std::sort(entries_to_zero.begin(), entries_to_zero.end());
        // we have to zero out additional entries
        for (i = 0; i < entries_to_zero.size(); )
        {
@@ -338,6 +339,15 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
        if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
        {
            printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
            // zero out the invalid entry, otherwise we'll hit "tried to overwrite non-zero metadata entry" later
            if (bs->inmemory_meta)
            {
                memset(entry, 0, bs->dsk.clean_entry_size);
            }
            else
            {
                entries_to_zero.push_back(done_cnt+i);
            }
            continue;
        }
    }

@@ -6,6 +6,10 @@
#include "cluster_client_impl.h"
#include "json_util.h"

#define TRY_SEND_OFFLINE 0
#define TRY_SEND_CONNECTING 1
#define TRY_SEND_OK 2

cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
{
    wb = new writeback_cache_t();
@@ -59,6 +63,10 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

    st_cli.parse_config(config);
    st_cli.infinite_start = false;
    if (!config["client_infinite_start"].is_null())
    {
        st_cli.infinite_start = config["client_infinite_start"].bool_value();
    }
    st_cli.load_global_config();

    scrap_buffer_size = SCRAP_BUFFER_SIZE;
@@ -67,6 +75,18 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

cluster_client_t::~cluster_client_t()
{
    if (retry_timeout_id >= 0)
    {
        tfd->clear_timer(retry_timeout_id);
        retry_timeout_duration = 0;
        retry_timeout_id = -1;
    }
    if (list_retry_timeout_id >= 0)
    {
        tfd->clear_timer(list_retry_timeout_id);
        list_retry_timeout_id = -1;
        list_retry_time = {};
    }
    msgr.repeer_pgs = [](osd_num_t){};
    if (ringloop)
    {
@@ -87,6 +107,46 @@ cluster_op_t::~cluster_op_t()
    }
}

bool cluster_op_t::support_left_on_dead()
{
    if (!parts.size())
    {
        return false;
    }
    for (auto & part: parts)
    {
        if (!(part.flags & PART_DONE) ||
            part.op.reply.hdr.opcode != OSD_OP_DELETE ||
            part.op.reply.hdr.retval != 0 ||
            !(part.op.reply.del.flags & OSD_DEL_SUPPORT_LEFT_ON_DEAD))
        {
            return false;
        }
    }
    return true;
}

std::vector<osd_num_t> cluster_op_t::get_left_on_dead()
{
    std::set<osd_num_t> osds;
    for (auto & part: parts)
    {
        if ((part.flags & PART_DONE) ||
            part.op.reply.hdr.opcode == OSD_OP_DELETE &&
            part.op.reply.hdr.retval == 0 &&
            (part.op.reply.del.flags & OSD_DEL_LEFT_ON_DEAD) != 0)
        {
            int del_count = (OSD_PACKET_SIZE-sizeof(part.op.reply.del)) / sizeof(uint32_t);
            if (del_count > part.op.reply.del.left_on_dead_count)
                del_count = part.op.reply.del.left_on_dead_count;
            uint32_t *left_on_dead = (uint32_t*)((&part.op.reply.del) + 1);
            for (int i = 0; i < del_count; i++)
                osds.insert(left_on_dead[i]);
        }
    }
    return std::vector<osd_num_t>(osds.begin(), osds.end());
}

void cluster_client_t::continue_raw_ops(osd_num_t peer_osd)
{
    auto it = raw_ops.find(peer_osd);
@@ -134,12 +194,12 @@ void cluster_client_t::unshift_op(cluster_op_t *op)
void cluster_client_t::calc_wait(cluster_op_t *op)
{
    op->prev_wait = 0;
    if (op->opcode == OSD_OP_WRITE)
    if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
    {
        for (auto prev = op->prev; prev; prev = prev->prev)
        {
            if (prev->opcode == OSD_OP_SYNC ||
                prev->opcode == OSD_OP_WRITE && !(op->flags & OP_FLUSH_BUFFER) && (prev->flags & OP_FLUSH_BUFFER))
                (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_DELETE) && !(op->flags & OP_FLUSH_BUFFER) && (prev->flags & OP_FLUSH_BUFFER))
            {
                op->prev_wait++;
            }
@@ -151,7 +211,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    {
        for (auto prev = op->prev; prev; prev = prev->prev)
        {
            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
            if (prev->opcode == OSD_OP_SYNC || (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_DELETE) &&
                (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
            {
                op->prev_wait++;
            }
@@ -167,7 +228,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)

void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
{
    if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC)
    if (opcode != OSD_OP_WRITE && opcode != OSD_OP_DELETE && opcode != OSD_OP_SYNC)
    {
        return;
    }
@@ -176,10 +237,10 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
    while (next)
    {
        auto n2 = next->next;
        if (opcode == OSD_OP_WRITE
        if ((opcode == OSD_OP_WRITE || opcode == OSD_OP_DELETE)
            ? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
            : (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
                (next->opcode == OSD_OP_WRITE || next->opcode == OSD_OP_DELETE) && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
            : (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE || next->opcode == OSD_OP_DELETE))
        {
            next->prev_wait += inc;
            assert(next->prev_wait >= 0);
@@ -252,16 +313,17 @@ void cluster_client_t::erase_op(cluster_op_t *op)
    }
    if (flags & OP_FLUSH_BUFFER)
    {
        auto overflow = std::move(wb->writeback_overflow);
        int i = 0;
        while (i < wb->writeback_overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
        while (i < overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
        {
            execute_internal(wb->writeback_overflow[i]);
            execute_internal(overflow[i]);
            i++;
        }
        if (i > 0)
        {
            wb->writeback_overflow.erase(wb->writeback_overflow.begin(), wb->writeback_overflow.begin()+i);
        }
        overflow.erase(overflow.begin(), overflow.begin()+i);
        assert(!wb->writeback_overflow.size());
        wb->writeback_overflow.swap(overflow);
    }
}

@@ -314,7 +376,7 @@ void cluster_client_t::reset_retry_timer(int new_duration)
    {
        return;
    }
    if (retry_timeout_id)
    if (retry_timeout_id >= 0)
    {
        tfd->clear_timer(retry_timeout_id);
    }
@@ -322,7 +384,7 @@ void cluster_client_t::reset_retry_timer(int new_duration)
    retry_timeout_id = tfd->set_timer(retry_timeout_duration, false, [this](int)
    {
        int time_passed = retry_timeout_duration;
        retry_timeout_id = 0;
        retry_timeout_id = -1;
        retry_timeout_duration = 0;
        continue_ops(time_passed);
    });
@@ -397,6 +459,16 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
    }
    // client_retry_enospc
    client_retry_enospc = config["client_retry_enospc"].is_null() ? true : config["client_retry_enospc"].bool_value();
    // client_wait_up_timeout
    if (!config["client_wait_up_timeout"].is_null())
        client_wait_up_timeout = config["client_wait_up_timeout"].uint64_value();
    else
    {
        auto etcd_report_interval = config["etcd_report_interval"].uint64_value();
        if (!etcd_report_interval)
            etcd_report_interval = 5;
        client_wait_up_timeout = 1+etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000;
}
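
A quick sanity check of that fallback formula: assuming etcd_report_interval = 5 s, st_cli.max_etcd_attempts = 5 and st_cli.etcd_quick_timeout = 1000 ms (these defaults are assumptions, not shown in this diff), integer arithmetic gives

1 + 5 + (5 * (2 * 1000) + 999) / 1000 = 1 + 5 + 10 = 16 seconds,

which matches the `client_wait_up_timeout = 16` default declared in cluster_client.h further below.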
    // log_level
    log_level = config["log_level"].uint64_value();
    msgr.parse_config(config);
@@ -434,7 +506,7 @@ void cluster_client_t::on_change_pool_config_hook()
    // And now they have to be resliced!
    for (auto op = op_queue_head; op; op = op->next)
    {
        if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
        if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE || op->opcode == OSD_OP_READ ||
            op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
            INODE_POOL(op->cur_inode) == pool_item.first)
        {
@@ -457,6 +529,7 @@ void cluster_client_t::on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_nu
    }
    // Always continue to resume operations hung because of lack of the primary OSD
    continue_ops();
    continue_lists();
}

bool cluster_client_t::get_immediate_commit(uint64_t inode)
@@ -477,6 +550,7 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
    if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
    {
        msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
        continue_lists();
    }
}

@@ -559,7 +633,8 @@ bool cluster_client_t::flush()
void cluster_client_t::execute(cluster_op_t *op)
{
    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP &&
        op->opcode != OSD_OP_WRITE && op->opcode != OSD_OP_DELETE)
    {
        op->retval = -EINVAL;
        auto cb = std::move(op->callback);
@@ -571,7 +646,7 @@ void cluster_client_t::execute(cluster_op_t *op)
        offline_ops.push_back(op);
        return;
    }
    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
    op->flags = op->flags & (OSD_OP_IGNORE_READONLY | OSD_OP_WAIT_UP_TIMEOUT); // allowed client flags
    execute_internal(op);
}

@@ -592,7 +667,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
    {
        return;
    }
    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
    if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
        !op->version /* no CAS writeback */)
    {
        if (wb->writebacks_active >= client_max_writeback_iodepth)
@@ -603,7 +678,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
        }
        // Just copy and acknowledge the operation
        wb->copy_write(op, CACHE_DIRTY);
        while (wb->writeback_bytes + op->len > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
        while (wb->writeback_bytes > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
        {
            // Initiate some writeback (asynchronously)
            wb->start_writebacks(this, 1);
@@ -613,7 +688,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
        cb(op);
        return;
    }
    if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
    if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
    {
        if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
        {
@@ -633,7 +708,10 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
            };
            execute_internal(sync_op);
        }
        dirty_bytes += op->len;
        if (op->opcode != OSD_OP_DELETE)
        {
            dirty_bytes += op->len;
        }
        dirty_ops++;
    }
    else if (op->opcode == OSD_OP_SYNC)
@@ -718,6 +796,36 @@ bool cluster_client_t::check_rw(cluster_op_t *op)
            return false;
        }
    }
    op->deoptimise_snapshot = false;
    if (enable_writeback && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
    {
        auto ino_it = st_cli.inode_config.find(op->inode);
        if (ino_it != st_cli.inode_config.end())
        {
            int chain_size = 0;
            while (ino_it != st_cli.inode_config.end() && ino_it->second.parent_id)
            {
                // Check for loops - FIXME check it in etcd_state_client
                if (ino_it->second.parent_id == op->inode ||
                    chain_size > st_cli.inode_config.size())
                {
                    op->retval = -EINVAL;
                    auto cb = std::move(op->callback);
                    cb(op);
                    return false;
                }
                if (INODE_POOL(ino_it->second.parent_id) == INODE_POOL(ino_it->first) &&
                    wb->has_inode(ino_it->second.parent_id))
                {
                    // Deoptimise reads - we have dirty data for one of the parent layer(s).
                    op->deoptimise_snapshot = true;
                    break;
                }
                chain_size++;
                ino_it = st_cli.inode_config.find(ino_it->second.parent_id);
            }
        }
    }
    return true;
}

@@ -777,9 +885,48 @@ resume_1:
    {
        if (!(op->parts[i].flags & PART_SENT))
        {
            if (!try_send(op, i))
            int is_ok = try_send(op, i);
            if (is_ok != TRY_SEND_OK)
            {
                // We'll need to retry again
                if (op->flags & OSD_OP_WAIT_UP_TIMEOUT)
                {
                    if (is_ok != TRY_SEND_OFFLINE)
                    {
                        // Reset "wait_up" timer
                        op->wait_up_until = {};
                    }
                    else if (!op->wait_up_until.tv_sec && !client_wait_up_timeout)
                    {
                        // Don't wait for the PG to come up at all and fail
                        op->parts[i].flags |= PART_ERROR;
                        if (!op->retval)
                            op->retval = -ETIMEDOUT;
                        break;
                    }
                    else if (!op->wait_up_until.tv_sec)
                    {
                        // Set "wait_up" timer
                        clock_gettime(CLOCK_REALTIME, &op->wait_up_until);
                        op->wait_up_until.tv_sec += client_wait_up_timeout;
                    }
                    else
                    {
                        // Check if the timeout expired
                        timespec tv;
                        clock_gettime(CLOCK_REALTIME, &tv);
                        if (tv.tv_sec > op->wait_up_until.tv_sec ||
                            tv.tv_sec == op->wait_up_until.tv_sec &&
                            tv.tv_nsec > op->wait_up_until.tv_nsec)
                        {
                            // Fail
                            op->parts[i].flags |= PART_ERROR;
                            if (!op->retval)
                                op->retval = -ETIMEDOUT;
                            break;
                        }
                    }
                }
                if (op->parts[i].flags & PART_RETRY)
                {
                    op->retry_after = client_retry_interval;
@@ -810,12 +957,21 @@ resume_2:
    {
        // Check parent inode
        auto ino_it = st_cli.inode_config.find(op->cur_inode);
        while (ino_it != st_cli.inode_config.end() && ino_it->second.parent_id &&
            INODE_POOL(ino_it->second.parent_id) == INODE_POOL(op->cur_inode) &&
            // Check for loops
            ino_it->second.parent_id != op->inode)
        // Skip parents from the same pool
        int skipped = 0;
        while (!op->deoptimise_snapshot &&
            ino_it != st_cli.inode_config.end() && ino_it->second.parent_id &&
            INODE_POOL(ino_it->second.parent_id) == INODE_POOL(op->cur_inode))
        {
            // Skip parents from the same pool
            // Check for loops - FIXME check it in etcd_state_client
            if (ino_it->second.parent_id == op->inode ||
                skipped > st_cli.inode_config.size())
            {
                op->retval = -EINVAL;
                erase_op(op);
                return 1;
            }
            skipped++;
            ino_it = st_cli.inode_config.find(ino_it->second.parent_id);
        }
        if (ino_it != st_cli.inode_config.end() &&
@@ -994,7 +1150,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            if (end == begin)
            {
                op->done_count++;
                op->parts[i].flags = PART_DONE;
                op->parts[i].flags = PART_SENT|PART_DONE;
            }
        }
        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
@@ -1053,7 +1209,7 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
    return false;
}

bool cluster_client_t::try_send(cluster_op_t *op, int i)
int cluster_client_t::try_send(cluster_op_t *op, int i)
{
    if (!msgr_initialized)
    {
@@ -1077,9 +1233,9 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
    );
    uint64_t meta_rev = 0;
    if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
    if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE && !op->deoptimise_snapshot)
    {
        auto ino_it = st_cli.inode_config.find(op->inode);
        auto ino_it = st_cli.inode_config.find(op->cur_inode);
        if (ino_it != st_cli.inode_config.end())
            meta_rev = ino_it->second.mod_revision;
    }
@@ -1109,14 +1265,15 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
        };
        part->op.iov = part->iov;
        msgr.outbox_push(&part->op);
        return true;
        return TRY_SEND_OK;
    }
    else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
    {
        msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
        return TRY_SEND_CONNECTING;
    }
    }
    return false;
    return TRY_SEND_OFFLINE;
}

int cluster_client_t::continue_sync(cluster_op_t *op)
@@ -1188,13 +1345,12 @@ resume_1:

void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
{
    auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
    assert(peer_it != msgr.osd_peer_fds.end());
    auto peer_fd = msgr.osd_peer_fds.at(part->osd_num);
    part->flags |= PART_SENT;
    op->inflight_count++;
    part->op = (osd_op_t){
        .op_type = OSD_OP_OUT,
        .peer_fd = peer_it->second,
        .peer_fd = peer_fd,
        .req = {
            .hdr = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@@ -1228,9 +1384,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
    {
        // Operation failed, retry
        part->flags |= PART_ERROR;
        if (!op->retval || op->retval == -EPIPE || part->op.reply.hdr.retval == -EIO)
        if (!op->retval || op->retval == -EPIPE ||
            part->op.reply.hdr.retval == -ENOSPC && op->retval == -ETIMEDOUT ||
            part->op.reply.hdr.retval == -EIO)
        {
            // Error priority: EIO > ENOSPC > EPIPE
            // Error priority: EIO > ENOSPC > ETIMEDOUT > EPIPE
            op->retval = part->op.reply.hdr.retval;
        }
        int stop_fd = -1;
@@ -1293,7 +1451,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
        }
    }
    else if (op->opcode == OSD_OP_WRITE)
    else if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
    {
        op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
    }
@@ -11,12 +11,11 @@
#define DEFAULT_CLIENT_MAX_BUFFERED_BYTES 32*1024*1024
#define DEFAULT_CLIENT_MAX_BUFFERED_OPS 1024
#define DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH 256
#define INODE_LIST_DONE 1
#define INODE_LIST_HAS_UNSTABLE 2
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
#define OSD_OP_READ_CHAIN_BITMAP 0x102

#define OSD_OP_IGNORE_READONLY 0x08
#define OSD_OP_WAIT_UP_TIMEOUT 0x10

struct cluster_op_t;

@@ -41,7 +40,8 @@ struct cluster_op_t
    // for reads and writes within a single object (stripe),
    // reads can return current version and writes can use "CAS" semantics
    uint64_t version = 0;
    // now only OSD_OP_IGNORE_READONLY is supported
    // flags: OSD_OP_IGNORE_READONLY - ignore inode readonly flag
    // OSD_OP_WAIT_UP_TIMEOUT - do not retry the operation infinitely if PG is inactive, only for <wait_up_timeout>
    uint64_t flags = 0;
    // negative retval is an error number
    // write and read return len on success
@@ -53,12 +53,18 @@ struct cluster_op_t
    void *bitmap_buf = NULL;
    std::function<void(cluster_op_t*)> callback;
    ~cluster_op_t();

    // for deletions, remove after 'atomic delete':
    bool support_left_on_dead();
    std::vector<osd_num_t> get_left_on_dead();
protected:
    int state = 0;
    uint64_t cur_inode; // for snapshot reads
    bool needs_reslice = false;
    bool needs_reslice: 1;
    bool deoptimise_snapshot: 1;
    int retry_after = 0;
    int inflight_count = 0, done_count = 0;
    timespec wait_up_until = {};
    std::vector<cluster_op_part_t> parts;
    void *part_bitmaps = NULL;
    unsigned bitmap_buf_size = 0;
@@ -71,6 +77,7 @@ protected:

struct inode_list_t;
struct inode_list_osd_t;
struct inode_list_pg_t;
class writeback_cache_t;

// FIXME: Split into public and private interfaces
@@ -95,8 +102,9 @@ class cluster_client_t
    int client_retry_interval = 50; // ms
    int client_eio_retry_interval = 1000; // ms
    bool client_retry_enospc = true;
    int client_wait_up_timeout = 16; // sec (for listings)

    int retry_timeout_id = 0;
    int retry_timeout_id = -1;
    int retry_timeout_duration = 0;
    std::vector<cluster_op_t*> offline_ops;
    cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
@@ -110,6 +118,8 @@ class cluster_client_t
    bool pgs_loaded = false;
    ring_consumer_t consumer;
    std::vector<std::function<void(void)>> on_ready_hooks;
    int list_retry_timeout_id = -1;
    timespec list_retry_time;
    std::vector<inode_list_t*> lists;
    std::multimap<osd_num_t, osd_op_t*> raw_ops;
    int continuing_ops = 0;
@@ -135,11 +145,10 @@ public:
    bool get_immediate_commit(uint64_t inode);

    void continue_ops(int time_passed = 0);
    inode_list_t *list_inode_start(inode_t inode,
        std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
    int list_pg_count(inode_list_t *lst);
    const std::vector<osd_num_t> & list_inode_get_inactive_osds(inode_list_t *lst);
    void list_inode_next(inode_list_t *lst, int next_pgs);

    void list_inode(inode_t inode, uint64_t min_offset, uint64_t max_offset, int max_parallel_pgs, std::function<void(
        int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> pg_callback);

    //inline uint32_t get_bs_bitmap_granularity() { return st_cli.global_bitmap_granularity; }
    //inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
    uint64_t next_op_id();
@@ -158,7 +167,7 @@ protected:
    bool check_rw(cluster_op_t *op);
    void slice_rw(cluster_op_t *op);
    void reset_retry_timer(int new_duration);
    bool try_send(cluster_op_t *op, int i);
    int try_send(cluster_op_t *op, int i);
    int continue_sync(cluster_op_t *op);
    void send_sync(cluster_op_t *op, cluster_op_part_t *part);
    void handle_op_part(cluster_op_part_t *part);
@@ -167,8 +176,14 @@ protected:
    void calc_wait(cluster_op_t *op);
    void inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc);
    void continue_lists();
    void continue_listing(inode_list_t *lst);
    bool continue_listing(inode_list_t *lst);
    bool restart_listing(inode_list_t* lst);
    void retry_start_pg_listing(inode_list_pg_t *pg);
    int start_pg_listing(inode_list_pg_t *pg);
    void send_list(inode_list_osd_t *cur_list);
    void set_list_retry_timeout(int ms, timespec new_time);
    void finish_list_pg(inode_list_pg_t *pg, bool retry_epipe);
    bool check_finish_listing(inode_list_t *lst);
    void continue_raw_ops(osd_num_t peer_osd);

    friend class writeback_cache_t;
@@ -42,10 +42,10 @@ public:
    std::multimap<uint64_t, uint64_t*> flushed_buffers; // flush_id => refcnt

    ~writeback_cache_t();
    bool has_inode(uint64_t inode);
    dirty_buf_it_t find_dirty(uint64_t inode, uint64_t offset);
    bool is_left_merged(dirty_buf_it_t dirty_it);
    bool is_right_merged(dirty_buf_it_t dirty_it);
    bool is_merged(const dirty_buf_it_t & dirty_it);
    void copy_write(cluster_op_t *op, int state, uint64_t new_flush_id = 0);
    int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num);
    void start_writebacks(cluster_client_t *cli, int count);

@@ -2,9 +2,17 @@
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)

#include <algorithm>
#include "assert.h"
#include "pg_states.h"
#include "cluster_client.h"

#define LIST_PG_INIT 0
#define LIST_PG_WAIT_ACTIVE 1
#define LIST_PG_WAIT_CONNECT 2
#define LIST_PG_WAIT_RETRY 3
#define LIST_PG_SENT 4
#define LIST_PG_DONE 5

struct inode_list_t;

struct inode_list_pg_t;
@@ -13,20 +21,22 @@ struct inode_list_osd_t
{
    inode_list_pg_t *pg = NULL;
    osd_num_t osd_num = 0;
    bool sent = false;
};

struct inode_list_pg_t
{
    inode_list_t *lst = NULL;
    int pos = 0;
    pg_num_t pg_num;
    osd_num_t cur_primary;
    bool has_unstable = false;
    int sent = 0;
    int done = 0;
    int errcode = 0;
    pg_num_t pg_num = 0;
    osd_num_t cur_primary = 0;
    int state = 0;
    int inflight_ops = 0;
    timespec wait_until;
    std::vector<inode_list_osd_t> list_osds;

    bool has_unstable = false;
    std::set<object_id> objects;
    std::vector<osd_num_t> inactive_osds;
};

struct inode_list_t
@@ -34,175 +44,295 @@ struct inode_list_t
{
    cluster_client_t *cli = NULL;
    pool_id_t pool_id = 0;
    inode_t inode = 0;
    uint64_t min_offset = 0;
    uint64_t max_offset = 0;
    int max_parallel_pgs = 16;

    bool fallback = false;
    int inflight_pgs = 0;
    std::map<osd_num_t, int> inflight_per_osd;
    int done_pgs = 0;
    int want = 0;
    std::vector<osd_num_t> inactive_osds;
    int onstack = 0;
    std::vector<inode_list_pg_t*> pgs;
    std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback;
    pg_num_t real_pg_count = 0;
    std::function<void(int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> callback;
};

inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
    std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
void cluster_client_t::list_inode(inode_t inode, uint64_t min_offset, uint64_t max_offset, int max_parallel_pgs, std::function<void(
    int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> pg_callback)
{
    init_msgr();
    int skipped_pgs = 0;
    pool_id_t pool_id = INODE_POOL(inode);
    if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())
    {
        if (log_level > 0)
        {
            fprintf(stderr, "Pool %u does not exist\n", pool_id);
        }
        return NULL;
        pg_callback(-EINVAL, 0, 0, std::set<object_id>());
        return;
    }
    auto pg_stripe_size = st_cli.pool_config.at(pool_id).pg_stripe_size;
    if (min_offset)
        min_offset = (min_offset/pg_stripe_size) * pg_stripe_size;
    inode_list_t *lst = new inode_list_t();
    lst->cli = this;
    lst->pool_id = pool_id;
    lst->inode = inode;
    lst->callback = callback;
    auto pool_cfg = st_cli.pool_config[pool_id];
    std::set<osd_num_t> inactive_osd_set;
    for (auto & pg_item: pool_cfg.pg_config)
    {
        auto & pg = pg_item.second;
        if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
        {
            skipped_pgs++;
            if (log_level > 0)
            {
                fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
            }
            continue;
        }
        inode_list_pg_t *r = new inode_list_pg_t();
        r->lst = lst;
        r->pg_num = pg_item.first;
        r->cur_primary = pg.cur_primary;
        if (pg.cur_state != PG_ACTIVE)
        {
            // Not clean
            std::set<osd_num_t> all_peers;
            for (osd_num_t pg_osd: pg.target_set)
            {
                if (pg_osd != 0)
                {
                    all_peers.insert(pg_osd);
                }
            }
            for (osd_num_t pg_osd: pg.all_peers)
            {
                if (pg_osd != 0)
                {
                    all_peers.insert(pg_osd);
                }
            }
            for (auto & hist_item: pg.target_history)
            {
                for (auto pg_osd: hist_item)
                {
                    if (pg_osd != 0)
                    {
                        all_peers.insert(pg_osd);
                    }
                }
            }
            for (osd_num_t peer_osd: all_peers)
            {
                if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
                {
                    r->list_osds.push_back((inode_list_osd_t){
                        .pg = r,
                        .osd_num = peer_osd,
                        .sent = false,
                    });
                }
                else
                {
                    inactive_osd_set.insert(peer_osd);
                }
            }
        }
        else
        {
            // Clean
            r->list_osds.push_back((inode_list_osd_t){
                .pg = r,
                .osd_num = pg.cur_primary,
                .sent = false,
            });
        }
        lst->pgs.push_back(r);
    }
    std::sort(lst->pgs.begin(), lst->pgs.end(), [](inode_list_pg_t *a, inode_list_pg_t *b)
    {
        return a->cur_primary < b->cur_primary ? true : false;
    });
    for (int i = 0; i < lst->pgs.size(); i++)
    {
        lst->pgs[i]->pos = i;
    }
    lst->inactive_osds.insert(lst->inactive_osds.end(), inactive_osd_set.begin(), inactive_osd_set.end());
    lst->min_offset = min_offset;
    lst->max_offset = max_offset;
    lst->callback = pg_callback;
    lst->max_parallel_pgs = max_parallel_pgs <= 0 ? 16 : max_parallel_pgs;
    lists.push_back(lst);
    return lst;
}

int cluster_client_t::list_pg_count(inode_list_t *lst)
{
    return lst->pgs.size();
}

const std::vector<osd_num_t> & cluster_client_t::list_inode_get_inactive_osds(inode_list_t *lst)
{
    return lst->inactive_osds;
}

void cluster_client_t::list_inode_next(inode_list_t *lst, int next_pgs)
{
    if (next_pgs >= 0)
    {
        lst->want += next_pgs;
    }
    continue_listing(lst);
}

void cluster_client_t::continue_listing(inode_list_t *lst)
bool cluster_client_t::continue_listing(inode_list_t *lst)
{
    if (lst->done_pgs >= lst->pgs.size())
    if (lst->onstack > 0)
    {
        return;
        return true;
    }
    if (lst->want <= 0)
    lst->onstack++;
    if (restart_listing(lst))
    {
        return;
    }
    for (int i = 0; i < lst->pgs.size(); i++)
    {
        if (lst->pgs[i] && lst->pgs[i]->sent < lst->pgs[i]->list_osds.size())
        for (int i = 0; i < lst->pgs.size() && lst->inflight_pgs < lst->max_parallel_pgs; i++)
        {
            for (int j = 0; j < lst->pgs[i]->list_osds.size(); j++)
            retry_start_pg_listing(lst->pgs[i]);
        }
    }
    if (check_finish_listing(lst))
    {
        // Do not change lst->onstack because it's already freed
        return false;
    }
    lst->onstack--;
    return true;
}

bool cluster_client_t::restart_listing(inode_list_t* lst)
{
    auto pool_it = st_cli.pool_config.find(lst->pool_id);
    // We want listing to be consistent. To achieve it we should:
    // 1) retry listing of each PG if its state changes
    // 2) abort listing if PG count changes during listing
    // 3) ideally, only talk to the primary OSD - this will be done separately
    // So first we add all PGs without checking their state
    if (pool_it == st_cli.pool_config.end() ||
        lst->real_pg_count != pool_it->second.real_pg_count)
|
||||
{
|
||||
for (auto pg: lst->pgs)
|
||||
{
|
||||
if (pg->inflight_ops > 0)
|
||||
{
|
||||
send_list(&lst->pgs[i]->list_osds[j]);
|
||||
if (lst->want <= 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
// Wait until all in-progress listings complete or fail
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (auto pg: lst->pgs)
|
||||
{
|
||||
delete pg;
|
||||
}
|
||||
if (log_level > 0 && lst->real_pg_count)
|
||||
{
|
||||
fprintf(stderr, "PG count in pool %u changed during listing\n", lst->pool_id);
|
||||
}
|
||||
lst->pgs.clear();
|
||||
if (pool_it == st_cli.pool_config.end())
|
||||
{
|
||||
// Unknown pool
|
||||
lst->callback(-EINVAL, 0, 0, std::set<object_id>());
|
||||
return false;
|
||||
}
|
||||
else if (lst->done_pgs)
|
||||
{
|
||||
// PG count changed during listing, it should fail
|
||||
lst->callback(-EAGAIN, 0, 0, std::set<object_id>());
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
lst->real_pg_count = pool_it->second.real_pg_count;
|
||||
for (pg_num_t pg_num = 1; pg_num <= lst->real_pg_count; pg_num++)
|
||||
{
|
||||
inode_list_pg_t *pg = new inode_list_pg_t();
|
||||
pg->lst = lst;
|
||||
pg->pg_num = pg_num;
|
||||
lst->pgs.push_back(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void cluster_client_t::retry_start_pg_listing(inode_list_pg_t *pg)
|
||||
{
|
||||
if (pg->state == LIST_PG_SENT || pg->state == LIST_PG_DONE)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (pg->state == LIST_PG_WAIT_RETRY)
|
||||
{
|
||||
// Check if the timeout expired
|
||||
timespec tv;
|
||||
clock_gettime(CLOCK_REALTIME, &tv);
|
||||
if (tv.tv_sec < pg->wait_until.tv_sec ||
|
||||
tv.tv_sec == pg->wait_until.tv_sec && tv.tv_nsec < pg->wait_until.tv_nsec)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
int new_st = start_pg_listing(pg);
|
||||
if (new_st == LIST_PG_SENT || new_st == LIST_PG_WAIT_CONNECT)
|
||||
{
|
||||
// sent => wait for completion
|
||||
// not connected, but OSD state exists => wait for PG or OSD state change infinitely
|
||||
pg->state = new_st;
|
||||
return;
|
||||
}
|
||||
if (new_st == LIST_PG_WAIT_ACTIVE && pg->state != LIST_PG_WAIT_ACTIVE)
|
||||
{
|
||||
if (!client_wait_up_timeout)
|
||||
{
|
||||
fprintf(stderr, "PG %u/%u is inactive, skipping listing\n", pg->lst->pool_id, pg->pg_num);
|
||||
pg->errcode = -EPIPE;
|
||||
pg->list_osds.clear();
|
||||
pg->objects.clear();
|
||||
finish_list_pg(pg, false);
|
||||
return;
|
||||
}
|
||||
pg->state = new_st;
|
||||
clock_gettime(CLOCK_REALTIME, &pg->wait_until);
|
||||
pg->wait_until.tv_sec += client_wait_up_timeout;
|
||||
if (log_level > 1)
|
||||
{
|
||||
fprintf(stderr, "Waiting for PG %u/%u to become active for %d seconds\n", pg->lst->pool_id, pg->pg_num, client_wait_up_timeout);
|
||||
}
|
||||
set_list_retry_timeout(client_wait_up_timeout*1000, pg->wait_until);
|
||||
return;
|
||||
}
|
||||
assert(pg->state == LIST_PG_WAIT_ACTIVE);
|
||||
// Check if the timeout expired
|
||||
timespec tv;
|
||||
clock_gettime(CLOCK_REALTIME, &tv);
|
||||
if (tv.tv_sec > pg->wait_until.tv_sec ||
|
||||
tv.tv_sec == pg->wait_until.tv_sec && tv.tv_nsec >= pg->wait_until.tv_nsec)
|
||||
{
|
||||
fprintf(stderr, "Failed to wait for PG %u/%u to become active, skipping listing\n", pg->lst->pool_id, pg->pg_num);
|
||||
pg->errcode = -EPIPE;
|
||||
pg->list_osds.clear();
|
||||
pg->objects.clear();
|
||||
finish_list_pg(pg, false);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::set_list_retry_timeout(int ms, timespec new_time)
|
||||
{
|
||||
if (!list_retry_time.tv_sec || list_retry_time.tv_sec > new_time.tv_sec ||
|
||||
list_retry_time.tv_sec == new_time.tv_sec && list_retry_time.tv_nsec > new_time.tv_nsec)
|
||||
{
|
||||
list_retry_time = new_time;
|
||||
if (list_retry_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(list_retry_timeout_id);
|
||||
}
|
||||
list_retry_timeout_id = tfd->set_timer(ms, false, [this](int timer_id)
|
||||
{
|
||||
list_retry_timeout_id = -1;
|
||||
list_retry_time = {};
|
||||
continue_lists();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
int cluster_client_t::start_pg_listing(inode_list_pg_t *pg)
|
||||
{
|
||||
auto & pool_cfg = st_cli.pool_config.at(pg->lst->pool_id);
|
||||
auto pg_it = pool_cfg.pg_config.find(pg->pg_num);
|
||||
assert(pg->lst->real_pg_count == pool_cfg.real_pg_count);
|
||||
if (pg_it == pool_cfg.pg_config.end() ||
|
||||
pg_it->second.pause ||
|
||||
!pg_it->second.cur_primary ||
|
||||
!(pg_it->second.cur_state & PG_ACTIVE))
|
||||
{
|
||||
// PG is (temporarily?) unavailable
|
||||
return LIST_PG_WAIT_ACTIVE;
|
||||
}
|
||||
pg->inactive_osds.clear();
|
||||
std::set<osd_num_t> all_peers;
|
||||
if (pg_it->second.cur_state != PG_ACTIVE && pg->lst->fallback)
|
||||
{
|
||||
// Not clean and OSDs don't support listing from primary
|
||||
for (osd_num_t pg_osd: pg_it->second.target_set)
|
||||
all_peers.insert(pg_osd);
|
||||
for (osd_num_t pg_osd: pg_it->second.all_peers)
|
||||
all_peers.insert(pg_osd);
|
||||
for (auto & hist_item: pg_it->second.target_history)
|
||||
for (auto pg_osd: hist_item)
|
||||
all_peers.insert(pg_osd);
|
||||
// Remove zero OSD number
|
||||
all_peers.erase(0);
|
||||
// Remove unconnectable peers except cur_primary
|
||||
for (auto peer_it = all_peers.begin(); peer_it != all_peers.end(); )
|
||||
{
|
||||
if (*peer_it != pg_it->second.cur_primary &&
|
||||
st_cli.peer_states[*peer_it].is_null())
|
||||
{
|
||||
pg->inactive_osds.push_back(*peer_it);
|
||||
all_peers.erase(peer_it++);
|
||||
}
|
||||
else
|
||||
peer_it++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Clean
|
||||
all_peers.insert(pg_it->second.cur_primary);
|
||||
}
|
||||
// Check that we're connected to all PG OSDs
|
||||
bool conn = true;
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
if (st_cli.peer_states[peer_osd].is_null())
|
||||
{
|
||||
return LIST_PG_WAIT_ACTIVE;
|
||||
}
|
||||
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||
conn = false;
|
||||
}
|
||||
}
|
||||
if (!conn)
|
||||
{
|
||||
return LIST_PG_WAIT_CONNECT;
|
||||
}
|
||||
// Send all listings at once as the simplest way to guarantee that we connect
|
||||
// to the exact same OSDs that are listed in PG state
|
||||
pg->errcode = 0;
|
||||
pg->list_osds.clear();
|
||||
pg->has_unstable = false;
|
||||
pg->objects.clear();
|
||||
pg->cur_primary = pg_it->second.cur_primary;
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
pg->list_osds.push_back((inode_list_osd_t){
|
||||
.pg = pg,
|
||||
.osd_num = peer_osd,
|
||||
});
|
||||
}
|
||||
for (auto & list_osd: pg->list_osds)
|
||||
{
|
||||
send_list(&list_osd);
|
||||
}
|
||||
return LIST_PG_SENT;
|
||||
}
|
||||
|
||||
void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
{
|
||||
if (cur_list->sent)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (msgr.osd_peer_fds.find(cur_list->osd_num) == msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
msgr.connect_peer(cur_list->osd_num, st_cli.peer_states[cur_list->osd_num]);
|
||||
return;
|
||||
}
|
||||
if (!cur_list->pg->inflight_ops)
|
||||
cur_list->pg->lst->inflight_pgs++;
|
||||
cur_list->pg->inflight_ops++;
|
||||
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
@@ -220,6 +350,9 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
.pg_stripe_size = pool_cfg.pg_stripe_size,
|
||||
.min_inode = cur_list->pg->lst->inode,
|
||||
.max_inode = cur_list->pg->lst->inode,
|
||||
.min_stripe = cur_list->pg->lst->min_offset,
|
||||
.max_stripe = cur_list->pg->lst->max_offset,
|
||||
.flags = (uint64_t)(cur_list->pg->lst->fallback ? 0 : OSD_LIST_PRIMARY),
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
@@ -228,6 +361,29 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
{
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %ju (retval=%jd), skipping\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
if (!cur_list->pg->errcode ||
|
||||
cur_list->pg->errcode == -EPIPE ||
|
||||
op->reply.hdr.retval != -EPIPE)
|
||||
{
|
||||
cur_list->pg->errcode = op->reply.hdr.retval;
|
||||
}
|
||||
}
|
||||
else if ((op->req.sec_list.flags & OSD_LIST_PRIMARY) &&
|
||||
!(op->reply.sec_list.flags & OSD_LIST_PRIMARY))
|
||||
{
|
||||
// OSD is old and doesn't support listing from primary
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "[PG %u/%u] Primary OSD doesn't support consistent listings, falling back to listings from all peers\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num
|
||||
);
|
||||
}
|
||||
cur_list->pg->lst->fallback = true;
|
||||
if (!cur_list->pg->errcode)
|
||||
{
|
||||
cur_list->pg->errcode = -EPIPE;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -256,55 +412,66 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
auto lst = cur_list->pg->lst;
|
||||
auto pg = cur_list->pg;
|
||||
pg->done++;
|
||||
if (pg->done >= pg->list_osds.size())
|
||||
{
|
||||
int status = 0;
|
||||
lst->done_pgs++;
|
||||
if (lst->done_pgs >= lst->pgs.size())
|
||||
{
|
||||
status |= INODE_LIST_DONE;
|
||||
}
|
||||
if (pg->has_unstable)
|
||||
{
|
||||
status |= INODE_LIST_HAS_UNSTABLE;
|
||||
}
|
||||
lst->callback(lst, std::move(pg->objects), pg->pg_num, pg->cur_primary, status);
|
||||
lst->pgs[pg->pos] = NULL;
|
||||
delete pg;
|
||||
if (lst->done_pgs >= lst->pgs.size())
|
||||
{
|
||||
// All done
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i] == lst)
|
||||
{
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete lst;
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
lst->want++;
|
||||
}
|
||||
continue_listing(lst);
|
||||
cur_list->pg->inflight_ops--;
|
||||
if (!cur_list->pg->inflight_ops)
|
||||
cur_list->pg->lst->inflight_pgs--;
|
||||
finish_list_pg(cur_list->pg, true);
|
||||
continue_listing(cur_list->pg->lst);
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
cur_list->sent = true;
|
||||
cur_list->pg->sent++;
|
||||
cur_list->pg->lst->want--;
|
||||
}
|
||||
|
||||
void cluster_client_t::finish_list_pg(inode_list_pg_t *pg, bool retry_epipe)
|
||||
{
|
||||
auto lst = pg->lst;
|
||||
if (pg->inflight_ops == 0)
|
||||
{
|
||||
if (pg->errcode == -EPIPE && retry_epipe)
|
||||
{
|
||||
// Retry listing after <client_retry_interval> ms on EPIPE
|
||||
pg->state = LIST_PG_WAIT_RETRY;
|
||||
clock_gettime(CLOCK_REALTIME, &pg->wait_until);
|
||||
pg->wait_until.tv_nsec += client_retry_interval*1000000;
|
||||
pg->wait_until.tv_sec += (pg->wait_until.tv_nsec / 1000000000);
|
||||
pg->wait_until.tv_nsec = (pg->wait_until.tv_nsec % 1000000000);
|
||||
set_list_retry_timeout(client_retry_interval, pg->wait_until);
|
||||
return;
|
||||
}
|
||||
lst->done_pgs++;
|
||||
pg->state = LIST_PG_DONE;
|
||||
lst->callback(pg->errcode, lst->pgs.size()-lst->done_pgs, pg->pg_num, std::move(pg->objects));
|
||||
pg->objects.clear();
|
||||
pg->inactive_osds.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_lists()
|
||||
{
|
||||
for (auto lst: lists)
|
||||
for (int i = lists.size()-1; i >= 0; i--)
|
||||
{
|
||||
continue_listing(lst);
|
||||
continue_listing(lists[i]);
|
||||
}
|
||||
}
|
||||
|
||||
bool cluster_client_t::check_finish_listing(inode_list_t *lst)
|
||||
{
|
||||
if (lst->done_pgs >= lst->pgs.size())
|
||||
{
|
||||
for (auto pg: lst->pgs)
|
||||
{
|
||||
delete pg;
|
||||
}
|
||||
lst->pgs.clear();
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i] == lst)
|
||||
{
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete lst;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
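Note on the API change above: the pull-style list_inode_start()/list_pg_count()/list_inode_next() triple is replaced by a single push-style list_inode() call that reports each PG through one callback and frees all listing state after the last PG. A hedged usage sketch follows; the caller names are illustrative, the signature is taken verbatim from the declaration above:

// Illustrative caller of the new listing API; assumes `cli` is a usable
// cluster_client_t* and `inode` an existing inode number.
void list_whole_inode(cluster_client_t *cli, inode_t inode)
{
    cli->list_inode(inode, /*min_offset*/ 0, /*max_offset*/ 0, /*max_parallel_pgs*/ 16,
        [](int status, int pgs_left, pg_num_t pg_num, std::set<object_id> && objects)
    {
        if (status < 0)
            fprintf(stderr, "PG %u listing failed: %d\n", pg_num, status);
        else
            printf("PG %u: %zu objects, %d PG(s) left\n", pg_num, objects.size(), pgs_left);
        // pgs_left == 0 appears to mean the whole listing is finished
        // and the client has already freed its internal state
    });
}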
@@ -9,7 +9,7 @@ writeback_cache_t::~writeback_cache_t()
{
    for (auto & bp: dirty_buffers)
    {
        if (!--(*bp.second.refcnt))
        if (bp.second.buf && !--(*bp.second.refcnt))
        {
            free(bp.second.refcnt); // refcnt is allocated with the buffer
        }
@@ -17,6 +17,15 @@ writeback_cache_t::~writeback_cache_t()
    dirty_buffers.clear();
}

bool writeback_cache_t::has_inode(uint64_t inode)
{
    auto dirty_it = dirty_buffers.lower_bound((object_id){
        .inode = inode,
        .stripe = 0,
    });
    return dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode;
}

dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
{
    auto dirty_it = dirty_buffers.lower_bound((object_id){
@@ -33,7 +42,11 @@ dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
            break;
        }
    }
    return dirty_it;
    if (dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode)
    {
        return dirty_it;
    }
    return dirty_buffers.end();
}

bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
@@ -43,6 +56,7 @@ bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
        auto prev_it = dirty_it;
        prev_it--;
        if (prev_it->first.inode == dirty_it->first.inode &&
            (prev_it->second.buf != NULL) == (dirty_it->second.buf != NULL) &&
            prev_it->first.stripe+prev_it->second.len == dirty_it->first.stripe &&
            prev_it->second.state == CACHE_DIRTY)
        {
@@ -58,6 +72,7 @@ bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
    next_it++;
    if (next_it != dirty_buffers.end() &&
        next_it->first.inode == dirty_it->first.inode &&
        (next_it->second.buf != NULL) == (dirty_it->second.buf != NULL) &&
        next_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len &&
        next_it->second.state == CACHE_DIRTY)
    {
@@ -66,11 +81,6 @@ bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
    return false;
}

bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
{
    return is_left_merged(dirty_it) || is_right_merged(dirty_it);
}

void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flush_id)
{
    // Save operation for replay when one of PGs goes out of sync
@@ -99,16 +109,22 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
                .inode = op->inode,
                .stripe = new_end,
            }, (cluster_buffer_t){
                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
                .buf = dirty_it->second.buf ? dirty_it->second.buf + new_end - dirty_it->first.stripe : NULL,
                .len = old_end - new_end,
                .state = dirty_it->second.state,
                .flush_id = dirty_it->second.flush_id,
                .refcnt = dirty_it->second.refcnt,
            });
            (*dirty_it->second.refcnt)++;
            if (dirty_it->second.buf)
            {
                (*dirty_it->second.refcnt)++;
            }
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                writeback_bytes -= op->len;
                if (dirty_it->second.buf)
                {
                    writeback_bytes -= op->len;
                }
                writeback_queue_size++;
            }
            break;
@@ -118,8 +134,11 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
            // Only leave the beginning
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                writeback_bytes -= old_end - op->offset;
                if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
                if (dirty_it->second.buf)
                {
                    writeback_bytes -= old_end - op->offset;
                }
                if (is_right_merged(dirty_it))
                {
                    writeback_queue_size++;
                }
@@ -133,8 +152,11 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
            // Only leave the end
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                writeback_bytes -= new_end - dirty_it->first.stripe;
                if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
                if (dirty_it->second.buf)
                {
                    writeback_bytes -= new_end - dirty_it->first.stripe;
                }
                if (is_left_merged(dirty_it))
                {
                    writeback_queue_size++;
                }
@@ -143,7 +165,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
                .inode = op->inode,
                .stripe = new_end,
            }, (cluster_buffer_t){
                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
                .buf = dirty_it->second.buf ? dirty_it->second.buf + new_end - dirty_it->first.stripe : NULL,
                .len = old_end - new_end,
                .state = dirty_it->second.state,
                .flush_id = dirty_it->second.flush_id,
@@ -156,13 +178,25 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
        else
        {
            // Remove the whole buffer
            if (dirty_it->second.state == CACHE_DIRTY && !is_merged(dirty_it))
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                writeback_bytes -= dirty_it->second.len;
                assert(writeback_queue_size > 0);
                writeback_queue_size--;
                if (dirty_it->second.buf)
                {
                    writeback_bytes -= dirty_it->second.len;
                }
                bool lm = is_left_merged(dirty_it);
                bool rm = is_right_merged(dirty_it);
                if (!lm && !rm)
                {
                    assert(writeback_queue_size > 0);
                    writeback_queue_size--;
                }
                else if (lm && rm)
                {
                    writeback_queue_size++;
                }
            }
            if (!--(*dirty_it->second.refcnt))
            if (dirty_it->second.buf && !--(*dirty_it->second.refcnt))
            {
                free(dirty_it->second.refcnt);
            }
@@ -170,9 +204,13 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
        }
    }
    // Overlapping buffers are removed, just insert the new one
    uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
    uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
    *refcnt = 1;
    bool is_del = op->opcode == OSD_OP_DELETE;
    uint64_t *refcnt = is_del ? NULL : (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
    uint8_t *buf = is_del ? NULL : ((uint8_t*)refcnt + sizeof(uint64_t));
    if (!is_del)
    {
        *refcnt = 1;
    }
    dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
        .inode = op->inode,
        .stripe = op->offset,
@@ -185,9 +223,11 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
    });
    if (state == CACHE_DIRTY)
    {
        writeback_bytes += op->len;
        writeback_bytes += is_del ? 0 : op->len;
        // Track consecutive write-back operations
        if (!is_merged(dirty_it))
        bool lm = is_left_merged(dirty_it);
        bool rm = is_right_merged(dirty_it);
        if (!lm && !rm)
        {
            // <writeback_queue> is OK to contain more than actual number of consecutive
            // requests as long as it doesn't miss anything. But <writeback_queue_size>
@@ -198,14 +238,22 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
                .stripe = op->offset,
            });
        }
        else if (lm && rm)
        {
            assert(writeback_queue_size > 0);
            writeback_queue_size--;
        }
    }
    uint64_t pos = 0, len = op->len, iov_idx = 0;
    while (len > 0 && iov_idx < op->iov.count)
    if (!is_del)
    {
        auto & iov = op->iov.buf[iov_idx];
        memcpy(buf + pos, iov.iov_base, iov.iov_len);
        pos += iov.iov_len;
        iov_idx++;
        uint64_t pos = 0, len = op->len, iov_idx = 0;
        while (len > 0 && iov_idx < op->iov.count)
        {
            auto & iov = op->iov.buf[iov_idx];
            memcpy(buf + pos, iov.iov_base, iov.iov_len);
            pos += iov.iov_len;
            iov_idx++;
        }
    }
}

@@ -219,7 +267,7 @@ int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd,
    for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
    {
        bool end = wr_it == dirty_buffers.end();
        bool flush_this = !end && wr_it->second.state != CACHE_REPEATING;
        bool flush_this = !end && wr_it->second.state != CACHE_REPEATING && wr_it->second.state != CACHE_DIRTY;
        if (peer_osd)
            flush_this = flush_this && cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
        if (pool_id && pg_num)
@@ -250,7 +298,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
    bool is_writeback = from_it->second.state == CACHE_DIRTY;
    cluster_op_t *op = new cluster_op_t;
    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
    op->opcode = OSD_OP_WRITE;
    op->opcode = from_it->second.buf ? OSD_OP_WRITE : OSD_OP_DELETE;
    op->cur_inode = op->inode = from_it->first.inode;
    op->offset = from_it->first.stripe;
    op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
@@ -260,9 +308,12 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
    {
        it->second.state = CACHE_REPEATING;
        it->second.flush_id = flush_id;
        (*it->second.refcnt)++;
        flushed_buffers.emplace(flush_id, it->second.refcnt);
        op->iov.push_back(it->second.buf, it->second.len);
        if (it->second.buf)
        {
            (*it->second.refcnt)++;
            flushed_buffers.emplace(flush_id, it->second.refcnt);
            op->iov.push_back(it->second.buf, it->second.len);
        }
        calc_len += it->second.len;
    }
    assert(calc_len == op->len);
@@ -334,10 +385,12 @@ void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
    }
    auto from_it = dirty_it;
    uint64_t off = dirty_it->first.stripe;
    bool is_del = (dirty_it->second.buf == NULL);
    while (from_it != dirty_buffers.begin())
    {
        from_it--;
        if (from_it->second.state != CACHE_DIRTY ||
            (from_it->second.buf == NULL) != is_del ||
            from_it->first.inode != req.inode ||
            from_it->first.stripe+from_it->second.len != off)
        {
@@ -352,6 +405,7 @@ void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
    while (to_it != dirty_buffers.end())
    {
        if (to_it->second.state != CACHE_DIRTY ||
            (to_it->second.buf == NULL) != is_del ||
            to_it->first.inode != req.inode ||
            to_it->first.stripe != off)
        {
@@ -364,6 +418,7 @@ void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
        assert(writeback_queue_size > 0);
        writeback_queue_size--;
        writeback_bytes -= off - from_it->first.stripe;
        assert(writeback_queue_size > 0 || !writeback_bytes);
        flush_buffers(cli, from_it, to_it);
    }
    queue_copy.erase(queue_copy.begin(), queue_copy.begin()+i);
@@ -391,15 +446,27 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
            auto & v = op->iov.buf[iov_idx];
            auto begin = (cur_offset < offset ? offset : cur_offset);
            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
            memcpy(
                (uint8_t*)v.iov_base + begin - cur_offset,
                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
                end - begin
            );
            if (!buf)
            {
                memset((uint8_t*)v.iov_base + begin - cur_offset, 0, end - begin);
            }
            else
            {
                memcpy(
                    (uint8_t*)v.iov_base + begin - cur_offset,
                    buf + (cur_offset <= offset ? 0 : cur_offset-offset),
                    end - begin
                );
            }
            cur_offset += v.iov_len;
            iov_idx++;
        }
    }
    if (!buf)
    {
        // Bitmap is initially zero, don't set it
        return;
    }
    // Set bitmap bits
    int start_bit = (offset-op->offset)/bitmap_granularity;
    int end_bit = (offset-op->offset+len)/bitmap_granularity;
@@ -449,7 +516,8 @@ bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granul
    {
        // Copy data
        dirty_copied = true;
        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
        copy_to_op(op, prev, dirty_it->second.buf ? (dirty_it->second.buf + prev - dirty_it->first.stripe) : NULL,
            cur-prev, bitmap_granularity);
    }
    skip_prev = skip;
    prev = cur;
@@ -461,7 +529,8 @@ bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granul
    {
        // Copy data
        dirty_copied = true;
        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
        copy_to_op(op, prev, dirty_it->second.buf ? (dirty_it->second.buf + prev - dirty_it->first.stripe) : NULL,
            cur-prev, bitmap_granularity);
    }
    dirty_it++;
}
@@ -497,8 +566,10 @@ void writeback_cache_t::fsync_ok()
{
    if (uw_it->second.state == CACHE_FLUSHING)
    {
        if (!--(*uw_it->second.refcnt))
        if (uw_it->second.buf && !--(*uw_it->second.refcnt))
        {
            free(uw_it->second.refcnt);
        }
        dirty_buffers.erase(uw_it++);
    }
    else
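Two invariants introduced in the write-back cache above are easy to miss: a cluster_buffer_t with buf == NULL now represents a cached delete (flushed as OSD_OP_DELETE and read back as zeroes), and for real buffers the reference counter shares one allocation with the payload, which is why freeing is always spelled free(refcnt). A minimal standalone sketch of that allocation trick, with illustrative helper names:

#include <stdint.h>
#include <stdlib.h>

// Sketch of the shared refcount+buffer allocation used by the cache:
// one malloc covers both the counter and the payload, the counter sits in front.
static uint8_t *buf_alloc(uint64_t len, uint64_t **refcnt_out)
{
    uint64_t *refcnt = (uint64_t*)malloc(sizeof(uint64_t) + len); // unchecked only for brevity
    *refcnt = 1;
    *refcnt_out = refcnt;
    return (uint8_t*)refcnt + sizeof(uint64_t); // payload follows the counter
}

static void buf_unref(uint64_t *refcnt)
{
    if (refcnt && !--(*refcnt))
        free(refcnt); // frees the counter and the payload together
}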
@@ -889,7 +889,11 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
    if (!pc.scrub_interval)
        pc.scrub_interval = 0;
    // Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
    pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
    pc.used_for_app = pool_item.second["used_for_fs"].as_string();
    if (pc.used_for_app != "")
        pc.used_for_app = "fs:"+pc.used_for_app;
    else
        pc.used_for_app = pool_item.second["used_for_app"].as_string();
    // Immediate Commit Mode
    pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
        ? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
@@ -1217,6 +1221,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
        .size = value["size"].uint64_value(),
        .parent_id = parent_inode_num,
        .readonly = value["readonly"].bool_value(),
        .deleted = value["deleted"].bool_value(),
        .meta = value["meta"],
        .mod_revision = kv.mod_revision,
    });
@@ -1305,6 +1310,10 @@ json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cf
    {
        new_cfg["readonly"] = true;
    }
    if (cfg->deleted)
    {
        new_cfg["deleted"] = true;
    }
    if (cfg->meta.is_object())
    {
        new_cfg["meta"] = cfg->meta;
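The parse_state() hunk above keeps old pool configs working: a legacy used_for_fs key is folded into used_for_app with an fs: prefix, so the rest of the code only ever inspects used_for_app. A hedged sketch of how a consumer might branch on that prefix; the helper and enum names are illustrative, the fs:/s3: prefixes come from the CLI help text later in this diff:

#include <string>

// Illustrative classification of a pool's used_for_app value:
// "fs:<name>" marks a VitastorFS pool, "s3:<name>" an S3 location,
// an empty string an ordinary pool usable for block volumes.
enum pool_app_t { APP_NONE, APP_FS, APP_S3, APP_OTHER };

static pool_app_t classify_used_for_app(const std::string & used_for_app)
{
    if (used_for_app == "")
        return APP_NONE;
    if (used_for_app.substr(0, 3) == "fs:")
        return APP_FS;
    if (used_for_app.substr(0, 3) == "s3:")
        return APP_S3;
    return APP_OTHER;
}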
@@ -61,7 +61,7 @@ struct pool_config_t
    uint64_t pg_stripe_size;
    std::map<pg_num_t, pg_config_t> pg_config;
    uint64_t scrub_interval;
    std::string used_for_fs;
    std::string used_for_app;
    int backfillfull;
};

@@ -72,6 +72,7 @@ struct inode_config_t
    uint64_t size = 0;
    inode_t parent_id = 0;
    bool readonly = false;
    bool deleted = false;
    // Arbitrary metadata
    json11::Json meta;
    // Change revision of the metadata in etcd
@@ -275,8 +275,8 @@ const char *help_text =
    "  --foreground 1\n"
    "    Stay in foreground, do not daemonize.\n"
    "\n"
    "vitastor-nbd unmap /dev/nbdN\n"
    "  Unmap an ioctl-mapped NBD device.\n"
    "vitastor-nbd unmap [--force] /dev/nbdN\n"
    "  Unmap an ioctl-mapped NBD device. Do not check if it's actually mapped if --force is specified.\n"
    "\n"
    "vitastor-nbd ls [--json]\n"
    "  List ioctl-mapped Vitastor NBD devices, optionally in JSON format.\n"
@@ -313,7 +313,7 @@ const char *help_text =
#endif
    "Use vitastor-nbd --help <command> for command details or vitastor-nbd --help --all for all details.\n"
    "\n"
    "All usual Vitastor config options like --config_file <path_to_config> may also be specified in CLI.\n"
    "All usual Vitastor config options like --config_path <path_to_config> may also be specified in CLI.\n"
;

class nbd_proxy
@@ -372,7 +372,8 @@ public:
    else if (args[i][0] == '-' && args[i][1] == '-')
    {
        const char *opt = args[i]+2;
        cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "all") || i == narg-1 ? "1" : args[++i];
        cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "all") ||
            !strcmp(opt, "force") || i == narg-1 ? "1" : args[++i];
    }
    else if (pos == 0)
    {
@@ -381,8 +382,9 @@ public:
    }
    else if (pos == 1)
    {
        char c = 0;
        int n = 0;
        if (sscanf(args[i], "/dev/nbd%d", &n) > 0)
        if (sscanf(args[i], "/dev/nbd%d%c", &n, &c) == 1)
            cfg["dev_num"] = n;
        else
            cfg["dev_num"] = args[i];
@@ -404,18 +406,14 @@ public:
    }
    else if (cfg["command"] == "unmap")
    {
        if (cfg["dev_num"].is_null())
        if (!cfg["dev_num"].is_number() &&
            cfg["dev_num"].string_value() != "0" &&
            !cfg["dev_num"].uint64_value())
        {
            fprintf(stderr, "device name or number is missing\n");
            exit(1);
        }
        if (cfg["netlink"].is_null())
        {
            ioctl_unmap(cfg["dev_num"].uint64_value());
        }
        else
        {
        }
        ioctl_unmap(cfg["dev_num"].uint64_value(), cfg["force"].bool_value());
    }
#ifdef HAVE_NBD_NETLINK_H
    else if (cfg["command"] == "netlink-map")
@@ -444,9 +442,18 @@ help:
    }
}

void ioctl_unmap(int dev_num)
void ioctl_unmap(int dev_num, bool force)
{
    char path[64] = { 0 };
    // Check if mapped
    sprintf(path, "/sys/block/nbd%d/pid", dev_num);
    if (access(path, F_OK) != 0)
    {
        fprintf(stderr, "/dev/nbd%d is not mapped: /sys/block/nbd%d/pid does not exist\n", dev_num, dev_num);
        if (!force)
            exit(1);
    }
    // Run unmap
    sprintf(path, "/dev/nbd%d", dev_num);
    int r, nbd = open(path, O_RDWR);
    if (nbd < 0)
@@ -610,36 +617,43 @@ help:
    {
        if (!cfg["dev_num"].is_null())
        {
            if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg) < 0)
            int r;
            if ((r = run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg)) != 0)
            {
                perror("run_nbd");
                fprintf(stderr, "run_nbd: %s\n", strerror(-r));
                exit(1);
            }
        }
        else
        {
            // Find an unused device
            auto mapped = list_mapped();
            int i = 0;
            while (true)
            {
                if (mapped.find("/dev/nbd"+std::to_string(i)) != mapped.end())
                {
                    i++;
                    continue;
                }
                int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
                if (r == 0)
                {
                    printf("/dev/nbd%d\n", i);
                    break;
                }
                else if (r == -1 && errno == ENOENT)
                else if (r == -ENOENT)
                {
                    fprintf(stderr, "No free NBD devices found\n");
                    exit(1);
                }
                else if (r == -2 && errno == EBUSY)
                else if (r == -EBUSY)
                {
                    i++;
                }
                else
                {
                    perror("run_nbd");
                    fprintf(stderr, "run_nbd: %s\n", strerror(-r));
                    exit(1);
                }
            }
@@ -869,81 +883,114 @@ protected:
    // Check handle size
    assert(sizeof(cur_req.handle) == 8);
    char path[64] = { 0 };
    sprintf(path, "/dev/nbd%d", dev_num);
    int r, nbd = open(path, O_RDWR), qd_fd;
    if (nbd < 0)
    int notifyfd[2] = { 0 };
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, notifyfd) < 0)
    {
        return -1;
        return -errno;
    }
    r = ioctl(nbd, NBD_SET_SOCK, sockfd[1]);
    if (r < 0)
    if (!fork())
    {
        goto end_close;
    }
    r = ioctl(nbd, NBD_SET_BLKSIZE, 4096);
    if (r < 0)
    {
        goto end_unmap;
    }
    r = ioctl(nbd, NBD_SET_SIZE, size);
    if (r < 0)
    {
        goto end_unmap;
    }
    ioctl(nbd, NBD_SET_FLAGS, flags);
    if (timeout > 0)
    {
        r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)timeout);
        // Do all NBD configuration in the child process, after the last fork.
        // Why? It's needed because there is a race condition in the Linux kernel nbd driver
        // in nbd_add_socket() - it saves `current` task pointer as `nbd->task_setup` and
        // then rechecks if the new `current` is the same. Problem is that if that process
        // is already dead, `current` may be freed and then replaced by another process
        // with the same pointer value. So the check passes and NBD allows a different process
        // to set up a device which is already set up. Proper fix would have to be done in the
        // kernel code, but the workaround is obviously to perform NBD setup from the process
        // which will then actually call NBD_DO_IT. That process stays alive during the whole
        // time of NBD device execution and the (nbd->task_setup != current) check always
        // works correctly, and we don't accidentally break previous NBD devices while setting
        // up a new device. Forking to check every device is of course rather slow, so we also
        // do an additional check by calling list_mapped() before searching for a free NBD device.
        if (bg)
        {
            daemonize_fork();
        }
        close(notifyfd[0]);
        sprintf(path, "/dev/nbd%d", dev_num);
        int r, nbd = open(path, O_RDWR), qd_fd;
        if (nbd < 0)
        {
            write(notifyfd[1], &errno, sizeof(errno));
            exit(1);
        }
        r = ioctl(nbd, NBD_SET_SOCK, sockfd[1]);
        if (r < 0)
        {
            goto end_close;
        }
        r = ioctl(nbd, NBD_SET_BLKSIZE, 4096);
        if (r < 0)
        {
            goto end_unmap;
        }
    }
    // Configure request size
    sprintf(path, "/sys/block/nbd%d/queue/max_sectors_kb", dev_num);
    qd_fd = open(path, O_WRONLY);
    if (qd_fd < 0)
    {
        goto end_unmap;
    }
    r = write(qd_fd, "32768", 5);
    if (r != 5)
    {
        fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
    }
    close(qd_fd);
    if (!fork())
    {
        // Run in child
        r = ioctl(nbd, NBD_SET_SIZE, size);
        if (r < 0)
        {
            goto end_unmap;
        }
        ioctl(nbd, NBD_SET_FLAGS, flags);
        if (timeout > 0)
        {
            r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)timeout);
            if (r < 0)
            {
                goto end_unmap;
            }
        }
        // Configure request size
        sprintf(path, "/sys/block/nbd%d/queue/max_sectors_kb", dev_num);
        qd_fd = open(path, O_WRONLY);
        if (qd_fd < 0)
        {
            goto end_unmap;
        }
        r = write(qd_fd, "32768", 5);
        if (r != 5)
        {
            fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
        }
        close(qd_fd);
        // Notify parent
        errno = 0;
        write(notifyfd[1], &errno, sizeof(errno));
        close(notifyfd[1]);
        close(sockfd[0]);
        if (bg)
        {
            daemonize();
            daemonize_reopen_stdio();
        }
        r = ioctl(nbd, NBD_DO_IT);
        if (r < 0)
        {
            fprintf(stderr, "NBD device terminated with error: %s\n", strerror(errno));
            fprintf(stderr, "NBD device /dev/nbd%d terminated with error: %s\n", dev_num, strerror(errno));
        }
        close(sockfd[1]);
        ioctl(nbd, NBD_CLEAR_QUE);
        ioctl(nbd, NBD_CLEAR_SOCK);
        exit(0);
    }
    close(sockfd[1]);
    close(nbd);
    return 0;
end_close:
        r = errno;
        close(nbd);
        errno = r;
        return -2;
        write(notifyfd[1], &errno, sizeof(errno));
        close(nbd);
        exit(2);
end_unmap:
        r = errno;
        ioctl(nbd, NBD_CLEAR_SOCK);
        close(nbd);
        errno = r;
        return -3;
        write(notifyfd[1], &errno, sizeof(errno));
        ioctl(nbd, NBD_CLEAR_SOCK);
        close(nbd);
        exit(3);
    }
    // Parent - check status
    close(notifyfd[1]);
    int child_errno = 0;
    int ok = read(notifyfd[0], &child_errno, sizeof(child_errno));
    close(notifyfd[0]);
    if (ok && !child_errno)
    {
        close(sockfd[1]);
        return 0;
    }
    return -child_errno;
}

void submit_send()
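The run_nbd() rework above moves every NBD ioctl into the child that later calls NBD_DO_IT, to sidestep the nbd->task_setup race described in the comment, and reports setup success or the child's errno to the parent over a socketpair. A stripped-down sketch of just that handshake, independent of NBD; the function names are illustrative:

#include <errno.h>
#include <sys/socket.h>
#include <unistd.h>

// Sketch of the child->parent errno handshake used by the new run_nbd():
// the child does the failure-prone setup and writes 0 (or errno) to the socket,
// the parent blocks on read() and turns the result into a return code.
static int run_in_child_and_wait(int (*setup)(void))
{
    int notifyfd[2];
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, notifyfd) < 0)
        return -errno;
    if (!fork())
    {
        close(notifyfd[0]);
        errno = setup() < 0 ? errno : 0;
        write(notifyfd[1], &errno, sizeof(errno));
        // a real child would keep running here (run_nbd() goes on to NBD_DO_IT)
        _exit(errno ? 1 : 0);
    }
    close(notifyfd[1]);
    int child_errno = 0;
    int ok = read(notifyfd[0], &child_errno, sizeof(child_errno));
    close(notifyfd[0]);
    return (ok > 0) ? -child_errno : -EPIPE;
}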
@@ -51,6 +51,11 @@
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4

#define OSD_LIST_PRIMARY 1

#define OSD_DEL_SUPPORT_LEFT_ON_DEAD 1
#define OSD_DEL_LEFT_ON_DEAD 2

// common request and reply headers
struct __attribute__((__packed__)) osd_op_header_t
{
@@ -196,6 +201,9 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
    uint64_t min_stripe, max_stripe;
    // max stable object count
    uint32_t stable_limit;
    // flags - OSD_LIST_PRIMARY or 0
    // for OSD_LIST_PRIMARY, only a single-PG listing is allowed
    uint64_t flags;
};

struct __attribute__((__packed__)) osd_reply_sec_list_t
@@ -204,9 +212,11 @@ struct __attribute__((__packed__)) osd_reply_sec_list_t
    // stable object version count. header.retval = total object version count
    // FIXME: maybe change to the number of bytes in the reply...
    uint64_t stable_count;
    // flags - OSD_LIST_PRIMARY or 0
    uint64_t flags;
};

// read or write to the primary OSD (must be within individual stripe)
// read, write or delete command for the primary OSD (must be within individual stripe)
struct __attribute__((__packed__)) osd_op_rw_t
{
    osd_op_header_t header;
@@ -235,6 +245,20 @@ struct __attribute__((__packed__)) osd_reply_rw_t
    uint64_t version;
};

struct __attribute__((__packed__)) osd_reply_del_t
{
    osd_reply_header_t header;
    // OSD_DEL_SUPPORT_LEFT_ON_DEAD and/or OSD_DEL_LEFT_ON_DEAD or 0
    uint32_t flags;
    // for deletes, if flags & OSD_DEL_LEFT_ON_DEAD:
    // count of OSDs from which the object could be not deleted
    // these come directly after this del_left_on_dead_list_size as uint32_t[]
    // FIXME it's kind of a hack and will be removed in the future, when Vitastor will
    // have 'atomic deletions', i.e. when it will be able to remember deleted objects
    // and complete deletions automatically after extra OSDs are started
    uint32_t left_on_dead_count;
};

// sync to the primary OSD
struct __attribute__((__packed__)) osd_op_sync_t
{
@@ -307,6 +331,7 @@ union osd_any_reply_t
    osd_reply_sec_list_t sec_list;
    osd_reply_show_config_t show_conf;
    osd_reply_rw_t rw;
    osd_reply_del_t del;
    osd_reply_sync_t sync;
    osd_reply_describe_t describe;
    uint8_t buf[OSD_PACKET_SIZE];
@@ -294,7 +294,9 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)

    qemu_mutex_lock(&client->mutex);
    vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_schedule_uring_handler(client);
#endif
    qemu_mutex_unlock(&client->mutex);

    while (!task->complete)
@@ -566,6 +568,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
static void vitastor_close(BlockDriverState *bs)
{
    VitastorClient *client = bs->opaque;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_destroy(client->proxy);
    if (client->fds)
    {
@@ -573,7 +576,6 @@ static void vitastor_close(BlockDriverState *bs)
        client->fds = NULL;
        client->fd_alloc = client->fd_count = 0;
    }
    qemu_mutex_destroy(&client->mutex);
    if (client->config_path)
        g_free(client->config_path);
    if (client->etcd_host)
@@ -584,6 +586,8 @@ static void vitastor_close(BlockDriverState *bs)
        g_free(client->image);
    free(client->last_bitmap);
    client->last_bitmap = NULL;
    qemu_mutex_unlock(&client->mutex);
    qemu_mutex_destroy(&client->mutex);
}

#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
@@ -749,7 +753,9 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_schedule_uring_handler(client);
#endif
    qemu_mutex_unlock(&client->mutex);

    while (!task.complete)
@@ -783,7 +789,9 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_schedule_uring_handler(client);
#endif
    qemu_mutex_unlock(&client->mutex);

    while (!task.complete)
@@ -863,7 +871,9 @@ static int coroutine_fn vitastor_co_block_status(
    task.bitmap = client->last_bitmap = NULL;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_schedule_uring_handler(client);
#endif
    qemu_mutex_unlock(&client->mutex);
    while (!task.complete)
    {
@@ -950,7 +960,9 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)

    qemu_mutex_lock(&client->mutex);
    vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
    vitastor_schedule_uring_handler(client);
#endif
    qemu_mutex_unlock(&client->mutex);

    while (!task.complete)
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
Version: 1.10.0
Version: 1.11.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}
@@ -127,6 +127,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
    auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
    self->ringloop = ringloop;
    self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
    ringloop->loop();
    return self;
}

@@ -150,6 +151,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
    self->ringloop = ringloop;
    self->epmgr = new epoll_manager_t(self->ringloop);
    self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
    ringloop->loop();
    return self;
}

@@ -183,6 +185,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
    self->ringloop = ringloop;
    self->epmgr = new epoll_manager_t(self->ringloop);
    self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
    ringloop->loop();
    return self;
}

@@ -222,6 +225,18 @@ int vitastor_c_is_ready(vitastor_c *client)
    return client->cli->is_ready();
}

void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque)
{
    client->cli->on_ready([=]()
    {
        cb(opaque, 0);
    });
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_uring_wait_ready(vitastor_c *client)
{
    while (!client->cli->is_ready())
@@ -276,6 +291,10 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
        delete op;
    };
    client->cli->execute(op);
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
@@ -297,6 +316,31 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
        delete op;
    };
    client->cli->execute(op);
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
    VitastorIOHandler cb, void *opaque)
{
    cluster_op_t *op = new cluster_op_t;
    op->opcode = OSD_OP_DELETE;
    op->inode = inode;
    op->offset = offset;
    op->len = len;
    op->version = check_version;
    op->callback = [cb, opaque](cluster_op_t *op)
    {
        cb(opaque, op->retval);
        delete op;
    };
    client->cli->execute(op);
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
@@ -319,6 +363,10 @@ void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset,
        delete op;
    };
    client->cli->execute(op);
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
@@ -331,6 +379,10 @@ void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
        delete op;
    };
    client->cli->execute(op);
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque)
@@ -340,6 +392,10 @@ void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler c
        auto watch = client->cli->st_cli.watch_inode(std::string(image));
        cb(opaque, (long)watch);
    });
    if (client->ringloop)
    {
        client->ringloop->loop();
    }
}

void vitastor_c_close_watch(vitastor_c *client, void *handle)
@@ -7,7 +7,7 @@
#define VITASTOR_QEMU_PROXY_H

// C API wrapper version
#define VITASTOR_C_API_VERSION 4
#define VITASTOR_C_API_VERSION 5

#ifndef POOL_ID_BITS
#define POOL_ID_BITS 16
@@ -51,6 +51,7 @@ vitastor_c *vitastor_c_create_epoll_json(const char **options, int options_len);
void* vitastor_c_get_internal_client(vitastor_c *client);
void vitastor_c_destroy(vitastor_c *client);
int vitastor_c_is_ready(vitastor_c *client);
void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque);
int vitastor_c_uring_register_eventfd(vitastor_c *client);
void vitastor_c_uring_wait_ready(vitastor_c *client);
void vitastor_c_uring_handle_events(vitastor_c *client);
@@ -62,6 +63,8 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
    struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
    struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
    VitastorIOHandler cb, void *opaque);
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
    int with_parents, VitastorReadBitmapHandler cb, void *opaque);
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
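For the vitastor_c_delete() entry point added to the header above, a minimal hedged usage sketch; the VitastorIOHandler signature void(*)(void *opaque, long retval) is assumed from the cb(opaque, op->retval) calls in the wrapper, and a real caller would block on the registered eventfd instead of spinning:

// Illustrative delete of a byte range, driving the uring event loop to completion.
static void delete_done(void *opaque, long retval)
{
    *(long*)opaque = retval < 0 ? retval : 1;
}

static long delete_range(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len)
{
    long st = 0;
    vitastor_c_delete(client, inode, offset, len, 0 /* no version check */, delete_done, &st);
    while (!st)
        vitastor_c_uring_handle_events(client); // busy-spin only for brevity
    return st < 0 ? st : 0;
}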
@@ -51,8 +51,9 @@ static const char* help_text =
|
||||
" Rename, resize image or change its readonly status. Images with children can't be made read-write.\n"
|
||||
" If the new size is smaller than the old size, extra data will be purged.\n"
|
||||
" You should resize file system in the image, if present, before shrinking it.\n"
|
||||
" -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n"
|
||||
" --down-ok Proceed with shrinking even if some data will be left on unavailable OSDs.\n"
|
||||
" --deleted 1|0 Set/clear 'deleted image' flag (set automatically during unfinished deletes).\n"
|
||||
" -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n"
|
||||
" --down-ok Proceed with shrinking even if some data will be left on unavailable OSDs.\n"
|
||||
"\n"
|
||||
"vitastor-cli dd [iimg=<image> | if=<file>] [oimg=<image> | of=<file>] [bs=1M]\n"
|
||||
" [count=N] [seek/oseek=N] [skip/iseek=M] [iodepth=N] [status=progress]\n"
|
||||
@@ -101,6 +102,7 @@ static const char* help_text =
|
||||
" Requires more memory, but allows to show correct removal progress.\n"
|
||||
" --min-offset Purge only data starting with specified offset.\n"
|
||||
" --max-offset Purge only data before specified offset.\n"
|
||||
" --client_wait_up_timeout 16 Timeout for waiting until PGs are up in seconds.\n"
|
||||
"\n"
|
||||
"vitastor-cli merge-data <from> <to> [--target <target>]\n"
|
||||
" Merge layer data without changing metadata. Merge <from>..<to> to <target>.\n"
|
||||
@@ -184,7 +186,8 @@ static const char* help_text =
|
||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||
" --used_for_app fs:<name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||
" --used_for_app s3:<name> Mark pool as used for S3 location with name <name>\n"
|
||||
" --pg_stripe_size <number> Increase object grouping stripe\n"
|
||||
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
|
||||
" --wait Wait for the new pool to come online\n"
|
||||
@@ -196,7 +199,7 @@ static const char* help_text =
|
||||
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
|
||||
" Modify an existing pool. Modifiable parameters:\n"
|
||||
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_app <type>:<name>]\n"
|
||||
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
||||
" [--level_placement <rules>] [--raw_placement <rules>]\n"
|
||||
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
||||
@@ -222,7 +225,7 @@ static const char* help_text =
|
||||
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
|
||||
"\n"
|
||||
"GLOBAL OPTIONS:\n"
|
||||
" --config_file FILE Path to Vitastor configuration file\n"
|
||||
" --config_path FILE Path to Vitastor configuration file\n"
|
||||
" --etcd_address URL Etcd connection address\n"
|
||||
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
|
||||
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
|
||||
@@ -430,13 +433,22 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
     else if (cmd[0] == "rm")
     {
-        // Remove multiple snapshots and rebase their children
-        if (cmd.size() > 1)
+        if (cfg["exact"].bool_value() || cfg["matching"].bool_value())
         {
-            cfg["from"] = cmd[1];
-            if (cmd.size() > 2)
-                cfg["to"] = cmd[2];
+            cmd.erase(cmd.begin(), cmd.begin()+1);
+            cfg["globs"] = cmd;
+            action_cb = p->start_rm_wildcard(cfg);
         }
-        action_cb = p->start_rm(cfg);
+        else
+        {
+            // Remove multiple snapshots and rebase their children
+            if (cmd.size() > 1)
+            {
+                cfg["from"] = cmd[1];
+                if (cmd.size() > 2)
+                    cfg["to"] = cmd[2];
+            }
+            action_cb = p->start_rm(cfg);
+        }
     }
     else if (cmd[0] == "describe")
     {
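The new wildcard branch above suggests invocations like the following; the glob pattern syntax is an assumption read off the "globs" parameter, not spelled out in this diff:

    # remove all images whose names match the pattern exactly as given
    vitastor-cli rm --exact 'testimg@snap*'
    # remove matching layers, rebasing children as needed
    vitastor-cli rm --matching 'oldvm-*'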
@@ -30,7 +30,7 @@ struct cli_result_t
 class cli_tool_t
 {
 public:
-    uint64_t iodepth = 4, parallel_osds = 32;
+    uint64_t iodepth = 32, parallel_osds = 4;
     bool progress = false;
     bool list_first = false;
     bool json_output = false;
@@ -92,12 +92,12 @@ struct image_creator_t
         {
             new_pool_id = pools.begin()->first;
         }
-        if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
+        if (new_pool_id && !pools.at(new_pool_id).used_for_app.empty() && !force)
         {
             result = (cli_result_t){
                 .err = EINVAL,
                 .text = "Pool "+pools.at(new_pool_id).name+
-                    " is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
+                    " is used for application "+pools.at(new_pool_id).used_for_app+
                     ". Use --force if you really know what you are doing",
             };
             state = 100;
@@ -192,7 +192,7 @@ resume_3:
         }
     } while (!parent->etcd_result["succeeded"].bool_value());
     // Save into inode_config for library users to be able to take it from there immediately
-    new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
+    new_cfg.mod_revision = parent->etcd_result["header"]["revision"].uint64_value();
     parent->cli->st_cli.insert_inode_config(new_cfg);
     result = (cli_result_t){
         .err = 0,
@@ -215,6 +215,7 @@ resume_3:
         goto resume_3;
     else if (state == 4)
         goto resume_4;
+    // FIXME: take all info from etcd requests, not mixed with st_cli.inode_config
     for (auto & ic: parent->cli->st_cli.inode_config)
     {
         if (ic.second.name == image_name+"@"+new_snap)
@@ -269,7 +270,7 @@ resume_4:
         }
     } while (!parent->etcd_result["succeeded"].bool_value());
     // Save into inode_config for library users to be able to take it from there immediately
-    new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
+    new_cfg.mod_revision = parent->etcd_result["header"]["revision"].uint64_value();
     parent->cli->st_cli.insert_inode_config(new_cfg);
     result = (cli_result_t){
         .err = 0,
@@ -286,6 +287,7 @@ resume_4:

     json11::Json::object get_next_id()
     {
+        assert(new_pool_id);
         return json11::Json::object {
             { "request_range", json11::Json::object {
                 { "key", base64_encode(
@@ -321,6 +323,17 @@ resume_4:
         goto resume_2;
     else if (state == 3)
         goto resume_3;
+    if (!new_pool_id)
+    {
+        for (auto & ic: parent->cli->st_cli.inode_config)
+        {
+            if (ic.second.name == image_name)
+            {
+                new_pool_id = INODE_POOL(ic.first);
+                break;
+            }
+        }
+    }
     parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
         get_next_id(),
         json11::Json::object {
@@ -401,7 +414,7 @@ resume_3:
     auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
     size = kv.value["size"].uint64_value();
     new_parent_id = kv.value["parent_id"].uint64_value();
-    uint64_t parent_pool_id = kv.value["parent_pool_id"].uint64_value();
+    uint64_t parent_pool_id = kv.value["parent_pool"].uint64_value();
     if (new_parent_id)
     {
         new_parent_id = INODE_WITH_POOL(parent_pool_id ? parent_pool_id : old_pool_id, new_parent_id);
@@ -413,7 +426,7 @@ resume_3:

     void attempt_create()
     {
-        new_cfg = {
+        new_cfg = (inode_config_t){
             .num = INODE_WITH_POOL(new_pool_id, new_id),
             .name = image_name,
             .size = size,
@@ -68,6 +68,7 @@ struct image_lister_t
             { "pool_name", good_pool ? pool_it->second.name : "? (ID:"+std::to_string(INODE_POOL(ic.second.num))+")" },
             { "inode_num", INODE_NO_POOL(ic.second.num) },
             { "inode_id", ic.second.num },
+            { "deleted", ic.second.deleted },
         };
         if (ic.second.parent_id)
         {
@@ -371,7 +372,8 @@ resume_1:
             kv.second["delete_q"] = format_q(kv.second["delete_queue"].number_value());
         }
         kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
-        kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
+        kv.second["ro"] = kv.second["deleted"].bool_value() ? "DEL" :
+            (kv.second["readonly"].bool_value() ? "RO" : "-");
     }
     result.text = print_table(to_list(), cols, parent->color);
     state = 100;
@@ -544,7 +546,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
     lister->list_pool_name = lister->list_pool_id ? "" : cfg["pool"].as_string();
     lister->show_stats = cfg["long"].bool_value();
     lister->show_delete = cfg["del"].bool_value();
-    lister->sort_field = cfg["sort"].string_value();
+    lister->sort_field = cfg["sort"].string_value() != "" ? cfg["sort"].string_value() : "name";
     lister->reverse = cfg["reverse"].bool_value();
     lister->max_count = cfg["count"].uint64_value();
     for (auto & item: cfg["names"].array_items())
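Taken together, the listing changes suggest calls like these; the exact flag spellings are assumptions read off the cfg keys ("sort", "reverse", "count", "del") above:

    # list images sorted by size, largest first, showing only the top 10
    vitastor-cli ls -l --sort size --reverse --count 10
    # include entries marked as deleted (shown as DEL in the read-only column)
    vitastor-cli ls --del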
@@ -51,6 +51,7 @@ struct snap_merger_t
     btree::safe_btree_set<uint64_t> merge_offsets;
     btree::safe_btree_set<uint64_t>::iterator oit;
     std::map<inode_t, std::vector<uint64_t>> layer_lists;
+    std::map<inode_t, int> list_errcode;
     std::map<inode_t, uint64_t> layer_block_size;
     std::map<inode_t, uint64_t> layer_list_pos;
     std::vector<snap_rw_op_t*> continue_rwo, continue_rwo2;
@@ -251,6 +252,7 @@ struct snap_merger_t
     // Get parents and so on
     start_merge();
     // First list lower layers
+    list_errcode.clear();
     list_layers(true);
     state = 1;
 resume_1:
@@ -259,6 +261,15 @@ struct snap_merger_t
         // Wait for lists
         return;
     }
+    if (list_errcode.size())
+    {
+        result = (cli_result_t){
+            .err = EIO,
+            .text = "Failed to list lower layer(s) in some PGs, merging would be incorrect",
+        };
+        state = 100;
+        return;
+    }
     if (merge_offsets.size() > 0)
     {
         state = 2;
@@ -295,6 +306,7 @@ struct snap_merger_t
     state = 3;
 resume_3:
     // Then list upper layers
+    list_errcode.clear();
     list_layers(false);
     state = 4;
 resume_4:
@@ -303,6 +315,15 @@ struct snap_merger_t
         // Wait for lists
         return;
     }
+    if (list_errcode.size() > 0)
+    {
+        result = (cli_result_t){
+            .err = EIO,
+            .text = "Failed to list upper layer(s) in some PGs, merging would be incorrect",
+        };
+        state = 100;
+        return;
+    }
     state = 5;
     processed = 0;
     to_process = merge_offsets.size();
@@ -368,9 +389,13 @@ struct snap_merger_t
         if (lower ? (sp.second < target_rank) : (sp.second > target_rank))
         {
             lists_todo++;
-            inode_list_t* lst = parent->cli->list_inode_start(src, [this, src](
-                inode_list_t *lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
+            parent->cli->list_inode(src, 0, 0, parent->parallel_osds, [this, src](
+                int errcode, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)
             {
+                if (errcode)
+                {
+                    list_errcode[src] = errcode;
+                }
                 uint64_t layer_block = layer_block_size.at(src);
                 for (object_id obj: objects)
                 {
@@ -391,12 +416,18 @@ struct snap_merger_t
                         layer_list[pos++] = obj.stripe;
                     }
                 }
-                if (status & INODE_LIST_DONE)
+                if (!pgs_left)
                 {
                     auto & name = parent->cli->st_cli.inode_config.at(src).name;
-                    if (parent->progress)
+                    if (list_errcode.find(src) != list_errcode.end())
                     {
-                        printf("Got listing of layer %s (inode %ju in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
+                        fprintf(stderr, "Failed to get listing of layer %s (inode %ju in pool %u): %s (code %d)\n",
+                            name.c_str(), INODE_NO_POOL(src), INODE_POOL(src), strerror(-list_errcode[src]), list_errcode[src]);
+                    }
+                    else if (parent->progress)
+                    {
+                        fprintf(stderr, "Got listing of layer %s (inode %ju in pool %u)\n",
+                            name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
                     }
                     if (delete_source)
                     {
@@ -406,12 +437,7 @@ struct snap_merger_t
                         lists_todo--;
                         continue_merge_reent();
                     }
-                    else
-                    {
-                        parent->cli->list_inode_next(lst, 1);
-                    }
                 });
-                parent->cli->list_inode_next(lst, parent->parallel_osds);
             }
         }
     }
@@ -428,7 +454,7 @@ struct snap_merger_t
     {
         if (op->retval < 0)
        {
-            fprintf(stderr, "error reading target bitmap at offset %jx: %s\n", op->offset, strerror(-op->retval));
+            fprintf(stderr, "Warning: failed to read target bitmap at offset %jx: %s\n", op->offset, strerror(-op->retval));
        }
        else
        {
@@ -585,7 +611,7 @@ struct snap_merger_t
     subop->inode = inode_num;
     subop->offset = offset;
     subop->len = 0;
-    subop->flags = OSD_OP_IGNORE_READONLY;
+    subop->flags = OSD_OP_IGNORE_READONLY | OSD_OP_WAIT_UP_TIMEOUT;
     subop->callback = [](cluster_op_t *subop)
     {
         if (subop->retval != 0)
@@ -4,6 +4,7 @@
 #include "cli.h"
 #include "cluster_client.h"
 #include "str_util.h"
+#include "json_util.h"

 // Rename, resize image (and purge extra data on shrink) or change its readonly status
 struct image_changer_t
@@ -15,6 +16,7 @@ struct image_changer_t
     uint64_t new_size = 0;
     bool force_size = false, inc_size = false;
     bool set_readonly = false, set_readwrite = false, force = false;
+    bool set_deleted = false, new_deleted = false;
     bool down_ok = false;
     // interval between fsyncs
     int fsync_interval = 128;
@@ -82,6 +84,7 @@ struct image_changer_t
     }
     if ((!set_readwrite || !cfg.readonly) &&
         (!set_readonly || cfg.readonly) &&
+        (!set_deleted || cfg.deleted == new_deleted) &&
         (!new_size && !force_size || cfg.size == new_size || cfg.size >= new_size && inc_size) &&
         (new_name == "" || new_name == image_name))
     {
@@ -141,6 +144,10 @@ resume_1:
             return;
         }
     }
+    if (set_deleted)
+    {
+        cfg.deleted = new_deleted;
+    }
     if (new_name != "")
     {
         cfg.name = new_name;
@@ -219,7 +226,7 @@ resume_2:
         return;
     }
     // Save into inode_config for library users to be able to take it from there immediately
-    cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
+    cfg.mod_revision = parent->etcd_result["header"]["revision"].uint64_value();
     if (new_name != "")
     {
         parent->cli->st_cli.inode_by_name.erase(image_name);
@@ -251,6 +258,8 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
     changer->force = cfg["force"].bool_value();
     changer->set_readonly = cfg["readonly"].bool_value();
     changer->set_readwrite = cfg["readwrite"].bool_value();
+    changer->set_deleted = !cfg["deleted"].is_null();
+    changer->new_deleted = json_is_true(cfg["deleted"]);
     changer->fsync_interval = cfg["fsync_interval"].uint64_value();
     if (!changer->fsync_interval)
         changer->fsync_interval = 128;
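The new deleted flag plumbs through to the modify command; a hypothetical invocation (the exact flag spelling is an assumption based on the cfg["deleted"] key above):

    # mark an image as deleted without removing its data yet
    vitastor-cli modify testimg --deleted 1
    # clear the flag again
    vitastor-cli modify testimg --deleted 0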
@@ -90,8 +90,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
         value = sz;
     }
     else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
-        key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
-        key == "raw_placement")
+        key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_app" ||
+        key == "used_for_fs" || key == "raw_placement")
     {
         if (!value.is_string())
         {
@@ -156,8 +156,13 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
     {
         new_cfg.erase("parity_chunks");
     }
-    if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
+    if (new_cfg.find("used_for_app") != new_cfg.end() && new_cfg["used_for_app"].string_value() == "")
     {
-        new_cfg.erase("used_for_fs");
+        new_cfg.erase("used_for_app");
     }
+    if (new_cfg.find("used_for_app") == new_cfg.end() && new_cfg.find("used_for_fs") != new_cfg.end())
+    {
+        new_cfg["used_for_app"] = "fs:"+new_cfg["used_for_fs"].string_value();
+        new_cfg.erase("used_for_fs");
+    }

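In effect, an old used_for_fs setting is now rewritten into the used_for_app form automatically, so the two spellings below should be equivalent (pool and FS names are placeholders):

    # old spelling, still accepted and converted to used_for_app internally
    vitastor-cli modify-pool fs-pool --used_for_fs myfs
    # new canonical spelling
    vitastor-cli modify-pool fs-pool --used_for_app fs:myfs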
@@ -460,7 +460,7 @@ resume_8:
     }
     if (osd_bs && osd_bs != UINT32_MAX && osd_bs != parent->cli->st_cli.global_block_size)
     {
-        fprintf(stderr, "Auto-selecting block_size=%s because all pool OSDs use it\n", format_size(osd_bs).c_str());
+        fprintf(stderr, "Auto-selecting block_size=%s because all pool OSDs use it\n", format_size(osd_bs, false, true).c_str());
         upd["block_size"] = osd_bs;
     }
 }
@@ -479,7 +479,7 @@ resume_8:
     }
     if (osd_bg && osd_bg != UINT32_MAX && osd_bg != parent->cli->st_cli.global_bitmap_granularity)
     {
-        fprintf(stderr, "Auto-selecting bitmap_granularity=%s because all pool OSDs use it\n", format_size(osd_bg).c_str());
+        fprintf(stderr, "Auto-selecting bitmap_granularity=%s because all pool OSDs use it\n", format_size(osd_bg, false, true).c_str());
         upd["bitmap_granularity"] = osd_bg;
     }
 }
Some files were not shown because too many files have changed in this diff.