Compare commits

..

23 Commits

Author SHA1 Message Date
5a9e1ede52 Release 0.8.9
- The tests are now stable and run in a CI system based on Gitea CI
- The release includes final bug fixes for EC:
  - Implement missing EC recovery of allocation bitmap when built with ISA-L
  - Fix broken snapshot export with EC (allocation bitmap reads were giving incorrect results previously)
- Also fixed bugs manifesting under heavy load:
  - Fix monitor possibly applying incorrect PG history on retries
  - Fix monitor incorrectly changing PG count when last_clean_pgs contains less PGs than the new number
  - Allow writes to wait for free space again, but now correctly (previously dropped in 0.8.2)
  - Fix a rare segfault in client (handle client stop during incoming stream handling in 1 more place)
  - Make monitor correctly handle etcd connection errors - it could die instead of connecting to another etcd
  - Fix OSD rarely being unable to report PG states after a PG was taken over by another OSD
- Fixed return code for incomplete EC objects (now EIO) and made cluster client retry this error
- Made other small changes for tests: timeouts, nice/ionice for etcd, waiting conditions, NBD device checks and so on
2023-05-14 01:25:09 +03:00
1c9a188600 Add tests to CI 2023-05-14 00:06:09 +03:00
de3e609166 Add a FIXME about QEMU driver thread safety 2023-05-14 00:06:09 +03:00
11481170f5 Add a FIXME about ENOSPC 2023-05-13 23:59:44 +03:00
e69d459d43 Allow rebalance to start in test_interrupted_rebalance, raise etcd start timeout 2023-05-13 15:16:28 +03:00
da82754baa Wait for conditions in test_move_reappear instead of waiting a fixed amount of time 2023-05-12 23:18:07 +03:00
d356aca030 Add missing $NO_SAME OSD argument to test_splitbrain 2023-05-12 23:18:07 +03:00
04a273d213 Raise NBD timeout in tests 2023-05-12 23:18:07 +03:00
6442010f93 Skip offline PGs during state reporting when the state is already deleted or taken over by another OSD
This fixes OSDs being unable to report PG states in rare conditions
2023-05-12 23:17:45 +03:00
6f4dc16c59 Handle etcd connection errors correctly in mon (unhandled error events) 2023-05-11 11:02:44 +03:00
ce4a8067b5 Handle client stop during incoming stream handling in 1 more place 2023-05-11 01:53:41 +03:00
e431ecb715 Make tests more stable in CI 2023-05-11 01:53:41 +03:00
8cac795445 Return EIO instead of EINVAL for incomplete EC objects 2023-05-11 01:15:23 +03:00
a409598b16 Wait for free space again, but count on big_write flushes instead of just flusher activity 2023-05-10 01:51:02 +03:00
f4c6765522 Ignore ENOENT in epoll_ctl 2023-05-08 20:39:20 +03:00
ad2916068a Fix test_add_osd rebalance timeout check 2023-05-08 20:39:20 +03:00
321cb435a6 Fix monitor incorrectly changing PG count when last_clean_pgs contains less PGs than the new number 2023-05-08 20:39:20 +03:00
cfcf4f4355 Support checking /dev/nbdX nodes in Docker 2023-05-08 20:39:20 +03:00
e0fb17bfee Make etcd more stable in tests (add ionice and raise timeout) 2023-05-08 20:36:00 +03:00
5b9031fecc Fix monitor possibly applying incorrect PG history under heavy load
Monitor could deceive itself by immediately saving PG configuration changes
which weren't applied to etcd yet in memory, and apply incorrect PG history
changes next time if the first update fails.

This usually only happened under heavy load and was caught in CI. :-)
2023-05-07 23:23:00 +03:00
5da1d8e1b5 Fix EC just-bitmap reads (len=0) (fixes SCHEME=ec test_snapshot.sh) 2023-05-07 14:00:08 +03:00
44f86f1999 Add a basic EC 2+2 recovery test (not really required, but let it be there) 2023-05-07 11:26:27 +03:00
2d9a80c6f6 Implement missing bitmap recovery with ISA-L \(°□°)/ 2023-05-07 11:25:51 +03:00
65 changed files with 1171 additions and 716 deletions

View File

@@ -0,0 +1,36 @@
FROM node:16-bullseye
WORKDIR /root
ADD ./docker/vitastor.gpg /etc/apt/trusted.gpg.d
RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/sources.list; \
echo 'deb http://vitastor.io/debian bullseye main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=bullseye-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: origin "vitastor.io"' >> /etc/apt/preferences; \
echo 'Pin-Priority: 1000' >> /etc/apt/preferences; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \
mkdir qemu-build; \
cd qemu-build; \
dpkg-source -x /root/qemu*.dsc; \
cd qemu*/; \
debian/rules configure-qemu || debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 config-poison.h || true; \
make -j8 qapi/qapi-builtin-types.h

View File

@@ -0,0 +1,16 @@
FROM git.yourcmc.ru/vitalif/vitastor/buildenv
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
dpkg-source -x /root/fio*.dsc; \
cd /root/vitastor; \
ln -s /root/fio-build/fio-*/ ./fio; \
ln -s /root/qemu-build/qemu-*/ ./qemu; \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
mkdir build; \
cd build; \
cmake .. -DWITH_ASAN=yes -DWITH_QEMU=yes; \
make -j16

552
.gitea/workflows/test.yml Normal file
View File

@@ -0,0 +1,552 @@
name: Test
on:
push:
branches:
- '*'
paths:
- '.gitea/**'
- 'src/**'
- 'mon/**'
- 'json11'
- 'cpp-btree'
- 'tests/**'
env:
BUILDENV_IMAGE: git.yourcmc.ru/vitalif/vitastor/buildenv
TEST_IMAGE: git.yourcmc.ru/vitalif/vitastor/test
OSD_ARGS: '--etcd_quick_timeout 2000'
concurrency:
group: ci-${{ github.ref }}
cancel-in-progress: true
jobs:
buildenv:
runs-on: ubuntu-latest
container: git.yourcmc.ru/vitalif/gitea-ci-dind
steps:
- uses: actions/checkout@v3
- name: Build and push
run: |
set -ex
if ! docker manifest inspect $BUILDENV_IMAGE >/dev/null; then
docker build -t $BUILDENV_IMAGE -f .gitea/workflows/buildenv.Dockerfile .
docker login git.yourcmc.ru -u vitalif -p "${{secrets.TOKEN}}"
docker push $BUILDENV_IMAGE
fi
build:
runs-on: ubuntu-latest
needs: buildenv
container: git.yourcmc.ru/vitalif/gitea-ci-dind
steps:
- uses: actions/checkout@v3
with:
submodules: true
- name: Build and push
run: |
set -ex
if ! docker manifest inspect $TEST_IMAGE:$GITHUB_SHA >/dev/null; then
docker build -t $TEST_IMAGE:$GITHUB_SHA -f .gitea/workflows/test.Dockerfile .
docker login git.yourcmc.ru -u vitalif -p "${{secrets.TOKEN}}"
docker push $TEST_IMAGE:$GITHUB_SHA
fi
make_test:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
# leak sanitizer sometimes crashes
- run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test
test_add_osd:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_add_osd.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_cas:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_cas.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_change_pg_count:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_change_pg_count.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_change_pg_count_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_change_pg_count.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_change_pg_size:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_change_pg_size.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_create_nomaxid:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_create_nomaxid.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_etcd_fail:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: /root/vitastor/tests/test_etcd_fail.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_failure_domain:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_failure_domain.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_ec_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_minsize_1:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_minsize_1.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_move_reappear:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_move_reappear.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_ec_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_rm.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_snapshot.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_splitbrain:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_splitbrain.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_no_same:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_write_no_same.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_pg_size_2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: PG_SIZE=2 /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

View File

@@ -0,0 +1,68 @@
#!/usr/bin/perl
use strict;
for my $line (<>)
{
if ($line =~ /\.\/(test_[^\.]+)/s)
{
chomp $line;
my $test_name = $1;
my $timeout = 3;
if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_interrupted_rebalance')
{
$timeout = 10;
}
while ($line =~ /([^\s=]+)=(\S+)/gs)
{
if ($1 eq 'SCHEME' && $2 eq 'ec')
{
$test_name .= '_ec';
}
elsif ($1 eq 'SCHEME' && $2 eq 'xor')
{
$test_name .= '_xor';
}
elsif ($1 eq 'IMMEDIATE_COMMIT')
{
$test_name .= '_imm';
}
else
{
$test_name .= '_'.lc($1).'_'.$2;
}
}
$line =~ s!\./test_!/root/vitastor/tests/test_!;
# Gitea CI doesn't support artifacts yet, lol
#- name: Upload results
# uses: actions/upload-artifact\@v3
# if: always()
# with:
# name: ${test_name}_result
# path: |
# /root/vitastor/testdata
# !/root/vitastor/testdata/*.bin
# retention-days: 5
print <<"EOF"
$test_name:
runs-on: ubuntu-latest
needs: build
container: \${{env.TEST_IMAGE}}:\${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: $timeout
run: $line
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- \$i --------"
cat \$i
echo ""
done
EOF
;
}
}

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "0.8.8")
set(VERSION "0.8.9")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.8
VERSION ?= v0.8.9
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.8
image: vitalif/vitastor-csi:v0.8.9
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.8
image: vitalif/vitastor-csi:v0.8.9
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.8"
vitastorCSIDriverVersion = "0.8.9"
)
// Config struct fills the parameters of request or user input

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.8-1) unstable; urgency=medium
vitastor (0.8.9-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.8-1) unstable; urgency=medium
vitastor (0.8.9-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.8; \
cd vitastor-0.8.8; \
cp -r /root/vitastor vitastor-0.8.9; \
cd vitastor-0.8.9; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.8.orig.tar.xz vitastor-0.8.8; \
cd vitastor-0.8.8; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.9.orig.tar.xz vitastor-0.8.9; \
cd vitastor-0.8.9; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

Binary file not shown.

View File

@@ -10,25 +10,18 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
if (!new_pg_history[new_pg])
{
new_pg_history[new_pg] = {
osd_set_epochs: {},
osd_sets: {},
all_peers: {},
epoch: 0,
};
}
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] };
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
if (oh && oh.osd_sets && oh.osd_sets.length)
{
for (const pg of oh.osd_sets)
{
nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg.map(osd_num => Number(osd_num)) };
}
}
if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length)
{
for (const pg of oh.osd_set_epochs)
{
nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set.map(osd_num => Number(osd_num)) };
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
}
}
if (oh && oh.all_peers && oh.all_peers.length)
@@ -46,20 +39,20 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
function finish_pg_history(merged_history)
{
merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs);
merged_history.osd_sets = Object.values(merged_history.osd_sets);
merged_history.all_peers = Object.values(merged_history.all_peers);
}
function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
function scale_pg_count(prev_pgs, real_prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
{
const old_pg_count = prev_pgs.length;
const old_pg_count = real_prev_pgs.length;
// Add all possibly intersecting PGs to the history of new PGs
if (!(new_pg_count % old_pg_count))
{
// New PG count is a multiple of old PG count
for (let i = 0; i < new_pg_count; i++)
{
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
add_pg_history(new_pg_history, i, real_prev_pgs, prev_pg_history, i % old_pg_count);
finish_pg_history(new_pg_history[i]);
}
}
@@ -71,7 +64,7 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
{
for (let j = 0; j < mul; j++)
{
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
add_pg_history(new_pg_history, i, real_prev_pgs, prev_pg_history, i+j*new_pg_count);
}
finish_pg_history(new_pg_history[i]);
}
@@ -83,7 +76,7 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
let merged_history = {};
for (let i = 0; i < old_pg_count; i++)
{
add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
add_pg_history(merged_history, 1, real_prev_pgs, prev_pg_history, i);
}
finish_pg_history(merged_history[1]);
for (let i = 0; i < new_pg_count; i++)
@@ -97,15 +90,15 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
new_pg_history[i] = null;
}
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
if (old_pg_count < new_pg_count)
if (prev_pgs.length < new_pg_count)
{
for (let i = old_pg_count; i < new_pg_count; i++)
for (let i = prev_pgs.length; i < new_pg_count; i++)
{
prev_pgs[i] = prev_pgs[i % old_pg_count];
prev_pgs[i] = prev_pgs[i % prev_pgs.length];
}
}
else if (old_pg_count > new_pg_count)
else if (prev_pgs.length > new_pg_count)
{
prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
prev_pgs.splice(new_pg_count, prev_pgs.length-new_pg_count);
}
}

View File

@@ -286,12 +286,7 @@ const etcd_tree = {
history: {
/* <pool_id>: {
<pg_id>: {
osd_set_epochs: {
osd_set: osd_num_t[],
min_epoch: uint64_t,
max_epoch: uint64_t,
}[],
osd_sets: osd_num_t[][], // outdated
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint64_t,
},
@@ -961,7 +956,7 @@ class Mon
return alive_set[this.rng() % alive_set.length];
}
save_new_pgs_txn(request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
save_new_pgs_txn(save_to, request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
{
const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree);
const pg_items = {};
@@ -973,6 +968,18 @@ class Mon
osd_set,
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
};
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
});
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{
@@ -1002,14 +1009,14 @@ class Mon
});
}
}
this.state.config.pgs.items = this.state.config.pgs.items || {};
save_to.items = save_to.items || {};
if (!new_pgs.length)
{
delete this.state.config.pgs.items[pool_id];
delete save_to.items[pool_id];
}
else
{
this.state.config.pgs.items[pool_id] = pg_items;
save_to.items[pool_id] = pg_items;
}
}
@@ -1153,6 +1160,7 @@ class Mon
if (this.state.config.pgs.hash != tree_hash)
{
// Something has changed
const new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
const etcd_request = { compare: [], success: [] };
for (const pool_id in (this.state.config.pgs||{}).items||{})
{
@@ -1173,7 +1181,7 @@ class Mon
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
} });
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
}
}
for (const pool_id in this.state.config.pools)
@@ -1227,7 +1235,7 @@ class Mon
return;
}
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
}
for (const pg of prev_pgs)
@@ -1280,14 +1288,15 @@ class Mon
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(this.state.pool.stats[pool_id])),
} });
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
}
this.state.config.pgs.hash = tree_hash;
await this.save_pg_config(etcd_request);
new_config_pgs.hash = tree_hash;
await this.save_pg_config(new_config_pgs, etcd_request);
}
else
{
// Nothing changed, but we still want to recheck the distribution of primaries
let new_config_pgs;
let changed = false;
for (const pool_id in this.state.config.pools)
{
@@ -1307,31 +1316,35 @@ class Mon
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds, aff_osds);
if (pg_cfg.primary != new_primary)
{
if (!new_config_pgs)
{
new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
}
console.log(
`Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
` primary OSD from ${pg_cfg.primary} to ${new_primary}`
);
changed = true;
pg_cfg.primary = new_primary;
new_config_pgs.items[pool_id][pg_num].primary = new_primary;
}
}
}
}
if (changed)
{
await this.save_pg_config();
await this.save_pg_config(new_config_pgs);
}
}
}
async save_pg_config(etcd_request = { compare: [], success: [] })
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
{
etcd_request.compare.push(
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
);
etcd_request.success.push(
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_config_pgs)) } },
);
const res = await this.etcd_call('/kv/txn', etcd_request, this.config.etcd_mon_timeout, 0);
if (!res.succeeded)
@@ -1810,6 +1823,7 @@ function POST(url, body, timeout)
clearTimeout(timer_id);
let res_body = '';
res.setEncoding('utf8');
res.on('error', (error) => ok({ error }));
res.on('data', chunk => { res_body += chunk; });
res.on('end', () =>
{
@@ -1829,6 +1843,8 @@ function POST(url, body, timeout)
}
});
});
req.on('error', (error) => ok({ error }));
req.on('close', () => ok({ error: new Error('Connection closed prematurely') }));
req.write(body_text);
req.end();
});

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.8'
VERSION = '0.8.9'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.8$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.9/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.9$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.9.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.8.9
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el7.tar.gz
Source0: vitastor-0.8.9.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.9.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.8.9
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el8.tar.gz
Source0: vitastor-0.8.9.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.9.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.8.9
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el9.tar.gz
Source0: vitastor-0.8.9.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.8")
add_definitions(-DVERSION="0.8.9")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -307,6 +307,18 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
if (!data_alloc->get_free_count() && big_to_flush > 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else
{
throw std::runtime_error("BUG: op->wait_for value is unexpected");

View File

@@ -160,6 +160,8 @@ struct __attribute__((__packed__)) dirty_entry
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
struct fulfill_read_t
{
@@ -263,6 +265,7 @@ class blockstore_impl_t
struct journal_t journal;
journal_flusher_t *flusher;
int big_to_flush = 0;
int write_iodepth = 0;
bool live = false, queue_stall = false;

View File

@@ -201,6 +201,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
}
while (1)
{
if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
IS_STABLE(dirty_it->second.state))
{
big_to_flush--;
}
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
dirty_it->second.location != UINT64_MAX)
{

View File

@@ -446,6 +446,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
{
inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
}
big_to_flush++;
}
else if (IS_DELETE(dirty_it->second.state))
{
@@ -454,6 +455,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(dirty_it->first.oid.inode);
big_to_flush++;
}
}
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||

View File

@@ -271,6 +271,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (loc == UINT64_MAX)
{
// no space
if (big_to_flush > 0)
{
// hope that some space will be available after flush
flusher->request_trim();
PRIV(op)->wait_for = WAIT_FREE;
return 0;
}
cancel_all_writes(op, dirty_it, -ENOSPC);
return 2;
}

View File

@@ -88,7 +88,7 @@ struct rm_osd_t
for (auto & hist_item: pg_cfg.target_history)
{
int hist_size = 0, hist_rm = 0;
for (auto & old_osd: hist_item.osd_set)
for (auto & old_osd: hist_item)
{
if (old_osd != 0)
{
@@ -382,7 +382,7 @@ struct rm_osd_t
for (int i = 0; i < pg_cfg.target_history.size(); i++)
{
int hist_size = 0, hist_rm = 0;
for (auto & old_osd: pg_cfg.target_history[i].osd_set)
for (auto & old_osd: pg_cfg.target_history[i])
{
if (old_osd != 0)
{
@@ -406,15 +406,6 @@ struct rm_osd_t
}
if (update_pg_history)
{
json11::Json::array target_history;
for (auto & pgh: pg_cfg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::string history_key = base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
@@ -425,7 +416,7 @@ struct rm_osd_t
{ "value", base64_encode(json11::Json(json11::Json::object {
{ "epoch", pg_cfg.epoch },
{ "all_peers", pg_cfg.all_peers },
{ "osd_set_epochs", target_history },
{ "osd_sets", pg_cfg.target_history },
}).dump()) },
} },
});

View File

@@ -743,15 +743,16 @@ resume_3:
erase_op(op);
return 1;
}
else if (op->retval != 0 && op->retval != -EPIPE)
else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
{
// Fatal error (not -EPIPE)
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
// FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
erase_op(op);
return 1;
}
else
{
// -EPIPE - clear the error and retry
// Non-fatal error - clear the error and retry
op->retval = 0;
if (op->needs_reslice)
{
@@ -1048,7 +1049,7 @@ resume_1:
uw_it->second.state = CACHE_DIRTY;
}
}
if (op->retval == -EPIPE)
if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
{
// Retry later
op->parts.clear();
@@ -1119,13 +1120,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
// Operation failed, retry
part->flags |= PART_ERROR;
if (!op->retval || op->retval == -EPIPE)
if (!op->retval || op->retval == -EPIPE || part->op.reply.hdr.retval == -EIO)
{
// Don't overwrite other errors with -EPIPE
// Error priority: EIO > ENOSPC > EPIPE
op->retval = part->op.reply.hdr.retval;
}
int stop_fd = -1;
if (op->retval != -EINTR && op->retval != -EIO)
if (op->retval != -EINTR && op->retval != -EIO && op->retval != -ENOSPC)
{
stop_fd = part->op.peer_fd;
fprintf(
@@ -1133,21 +1134,25 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
else
{
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
if (part->op.reply.hdr.retval == -EPIPE)
// Mark op->up_wait = true to retry operation after a short pause (not immediately)
op->up_wait = true;
if (!retry_timeout_id)
{
// Mark op->up_wait = true before stopping the client
op->up_wait = true;
if (!retry_timeout_id)
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
retry_timeout_id = 0;
continue_ops(true);
});
}
if (op->inflight_count == 0)
{

View File

@@ -96,7 +96,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
}
for (auto & hist_item: pg.target_history)
{
for (auto pg_osd: hist_item.osd_set)
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
@@ -106,14 +106,11 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
}
for (osd_num_t peer_osd: all_peers)
{
if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
{
r->list_osds.push_back((inode_list_osd_t){
.pg = r,
.osd_num = peer_osd,
.sent = false,
});
}
r->list_osds.push_back((inode_list_osd_t){
.pg = r,
.osd_num = peer_osd,
.sent = false,
});
}
}
else

View File

@@ -54,6 +54,13 @@ void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, in
ev.events = (wr ? EPOLLOUT : 0) | EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
{
if (errno == ENOENT)
{
// The FD is probably already closed
epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
epoll_handlers.erase(fd);
return;
}
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
epoll_handlers[fd] = handler;

View File

@@ -902,32 +902,9 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
history_set.insert(it, pg_osd_num);
}
}
pg_history_set_t epoch_set = { .osd_set = history_set };
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), epoch_set);
if (it == pg_cfg.target_history.end() || *it != epoch_set)
pg_cfg.target_history.insert(it, epoch_set);
}
// Newer format with epochs
for (auto hist_item: value["osd_set_epochs"].array_items())
{
pg_history_set_t history_set;
history_set.min_epoch = hist_item["min_epoch"].uint64_value();
history_set.max_epoch = hist_item["max_epoch"].uint64_value();
if (history_set.max_epoch < history_set.min_epoch)
{
history_set.max_epoch = 0;
history_set.min_epoch = 0;
}
for (auto pg_osd: hist_item["osd_set"].array_items())
{
history_set.osd_set.push_back(pg_osd.uint64_value());
}
if (history_set.max_epoch || history_set.osd_set.size())
{
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
if (it == pg_cfg.target_history.end() || *it != history_set)
pg_cfg.target_history.insert(it, history_set);
}
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
if (it == pg_cfg.target_history.end() || *it != history_set)
pg_cfg.target_history.insert(it, history_set);
}
// Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items())

View File

@@ -33,7 +33,7 @@ struct pg_config_t
bool exists;
osd_num_t primary;
std::vector<osd_num_t> target_set;
std::vector<pg_history_set_t> target_history;
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool pause;
osd_num_t cur_primary;

View File

@@ -103,7 +103,10 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
cl->recv_list.eat(result);
if (cl->recv_list.done >= cl->recv_list.count)
{
handle_finished_read(cl);
if (!handle_finished_read(cl))
{
goto fin;
}
}
}
if (result >= cl->read_iov.iov_len)

View File

@@ -192,7 +192,6 @@ class osd_t
void reset_stats();
json11::Json get_statistics();
void report_statistics();
void add_pg_history(pg_t & pg);
void report_pg_state(pg_t & pg);
void report_pg_states();
void apply_pg_count();

View File

@@ -674,7 +674,7 @@ void osd_t::apply_pg_config()
}
for (auto & hist_item: pg_cfg.target_history)
{
for (auto pg_osd: hist_item.osd_set)
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
@@ -812,11 +812,21 @@ void osd_t::report_pg_states()
pg_it->second.cur_state != 0)
{
pg_state_exists = true;
if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
{
// Nothing to check or report, PG is already taken over by another OSD
continue;
}
}
}
}
if (!pg_state_exists)
{
if (pg.state == PG_OFFLINE)
{
// Nothing to check or report, PG is already stopped
continue;
}
// Check that the PG key does not exist
// Failed check indicates an unsuccessful PG lock attempt in this case
checks.push_back(json11::Json::object {
@@ -868,40 +878,11 @@ void osd_t::report_pg_states()
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
json11::Json::array target_history;
for (auto & pgh: pg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::vector<osd_num_t> all_peers;
for (auto peer_osd: pg.all_peers)
{
bool found = false;
for (auto target_peer: pg.target_set)
{
if (target_peer == peer_osd)
{
found = true;
break;
}
}
if (!found)
{
all_peers.push_back(peer_osd);
}
}
json11::Json::object history_value = {
{ "epoch", pg.epoch },
{ "osd_set_epochs", target_history },
{ "all_peers", pg.all_peers },
{ "osd_sets", pg.target_history },
};
if (all_peers.size())
{
history_value["all_peers"] = all_peers;
}
checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", history_key },
@@ -930,6 +911,15 @@ void osd_t::report_pg_states()
{
etcd_reporting_pg_state = false;
if (!data["succeeded"].bool_value())
{
std::string rpgnames = "";
for (auto pp: reporting_pgs)
{
rpgnames += (rpgnames.size() ? ", " : "")+std::to_string(pp.pool_pg_num.pool_id)+"/"+std::to_string(pp.pool_pg_num.pg_num);
}
printf("Error reporting PG %s states, will repeat the attempt: %s\n", rpgnames.c_str(), err.c_str());
}
if (!data["succeeded"].bool_value())
{
// One of PG state updates failed, put dirty flags back
for (auto pp: reporting_pgs)

View File

@@ -287,25 +287,6 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
// Check if the object is deleted
bool is_deleted = false;
pool_id_t pool_id = INODE_POOL(op->oid.inode);
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it != st_cli.pool_config.end())
{
pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end())
{
pg_osd_set_state_t *object_state;
get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, but not from all OSDs - delete remaining copies
is_deleted = true;
}
}
}
op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = (osd_any_op_t){
@@ -313,7 +294,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = 1,
.opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE),
.opcode = OSD_OP_WRITE,
},
.inode = op->oid.inode,
.offset = op->oid.stripe,

View File

@@ -3,8 +3,6 @@
#pragma once
#include <vector>
#define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2
#define POOL_SCHEME_EC 3
@@ -40,25 +38,3 @@ inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
{
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
}
struct pg_history_set_t
{
std::vector<osd_num_t> osd_set;
uint64_t min_epoch, max_epoch;
};
inline bool operator == (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch == b.min_epoch && a.max_epoch == b.max_epoch && a.osd_set == b.osd_set;
}
inline bool operator != (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch != b.min_epoch || a.max_epoch != b.max_epoch || a.osd_set != b.osd_set;
}
inline bool operator < (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch < b.min_epoch || a.min_epoch == b.min_epoch &&
(a.max_epoch < b.max_epoch || a.max_epoch == b.max_epoch && a.osd_set < b.osd_set);
}

View File

@@ -191,7 +191,7 @@ struct __attribute__((__packed__)) osd_op_rw_t
uint64_t inode;
// offset
uint64_t offset;
// length
// length. 0 means to read all bitmaps of the specified range, but no data.
uint32_t len;
// flags (for future)
uint32_t flags;

View File

@@ -231,7 +231,7 @@ void osd_t::start_pg_peering(pg_t & pg)
for (auto & history_set: pg.target_history)
{
bool found = true;
for (auto history_osd: history_set.osd_set)
for (auto history_osd: history_set)
{
if (history_osd != 0)
{
@@ -471,71 +471,61 @@ void osd_t::finish_stop_pg(pg_t & pg)
report_pg_state(pg);
}
static int count_nonzero_osds(const std::vector<osd_num_t> & v)
{
int n = 0;
for (auto & osd_num: v)
{
if (osd_num != 0)
{
n++;
}
}
return n;
}
void osd_t::report_pg_state(pg_t & pg)
{
pg.print_state();
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) &&
(pg.target_history.size() != 1 ||
pg.target_history[0].osd_set != pg.target_set ||
pg.target_history[0].min_epoch != 0 ||
pg.target_history[0].max_epoch != pg.epoch ||
pg.all_peers.size() > count_nonzero_osds(pg.target_set)))
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
{
// Clear history of active+clean PGs
pg.history_changed = true;
pg.target_history.clear();
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = 0,
.max_epoch = pg.epoch,
});
if (pg.state == PG_ACTIVE)
{
pg.all_peers.clear();
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
pg.all_peers.push_back(pg_osd);
}
}
else
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
std::set<osd_num_t> dead_peers(pg.all_peers.begin(), pg.all_peers.end());
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
dead_peers.insert(pg_osd);
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
}
pg.all_peers = pg.target_set;
std::sort(pg.all_peers.begin(), pg.all_peers.end());
pg.cur_peers = pg.target_set;
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
if (pg.target_history.size())
{
pg.history_changed = true;
pg.target_history.clear();
}
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{
dead_peers.insert(pg_osd);
}
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
dead_peers.insert(pg_osd);
}
}
auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
if (pg.all_peers != new_all_peers)
{
pg.history_changed = true;
pg.all_peers = new_all_peers;
}
pg.cur_peers.clear();
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
pg.cur_peers.push_back(pg_osd);
}
}
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers;
@@ -546,51 +536,3 @@ void osd_t::report_pg_state(pg_t & pg)
}
report_pg_states();
}
void osd_t::add_pg_history(pg_t & pg)
{
bool epoch_already_reported = false;
int max_epoch_pos = -1;
for (int i = pg.target_history.size()-1; i >= 0; i--)
{
if (pg.target_history[i].min_epoch > pg.epoch)
{
printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n",
pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch);
force_stop(1);
return;
}
if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch)
{
max_epoch_pos = i;
}
if (pg.target_history[i].min_epoch <= pg.epoch &&
pg.target_history[i].max_epoch >= pg.epoch)
{
if (pg.target_history[i].osd_set != pg.cur_set)
{
printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch);
force_stop(1);
return;
}
// Already reported
epoch_already_reported = true;
break;
}
}
if (!epoch_already_reported)
{
if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set)
{
pg.target_history[max_epoch_pos].max_epoch = pg.epoch;
}
else
{
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = pg.epoch,
.max_epoch = pg.epoch,
});
}
}
}

View File

@@ -52,7 +52,6 @@ struct pg_obj_state_check_t
void walk();
void start_object();
void recheck_version_osd_set();
void handle_version();
void finish_object();
};
@@ -85,7 +84,6 @@ void pg_obj_state_check_t::walk()
pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
return;
}
// Activate PG
if (pg->pg_cursize < pg->pg_size)
{
// Activate as degraded
@@ -110,85 +108,13 @@ void pg_obj_state_check_t::start_object()
n_unstable = n_invalid = 0;
}
// FIXME: Put this under a feature flag
// FIXME: Implement OSD 'cookies' to be fool-proof so that if an OSD is wiped and
// recreated it doesn't also wipe all other data
void pg_obj_state_check_t::recheck_version_osd_set()
{
uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS));
if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size)
{
// Enough copies
return;
}
auto epoch_it = pg->target_by_epoch.lower_bound(epoch);
if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch)
{
// Epoch info not found
return;
}
if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size())
{
// For the (unlikely) case of PG size change - enough copies
return;
}
// Recheck version against the OSD set corresponding to epoch if it's known
if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch)
{
for (int j = 0; j < epoch_it->second.osd_set.size(); j++)
{
osd_num_t cur_osd = epoch_it->second.osd_set[j];
bool found = false;
for (int i = ver_start; i < ver_end; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
if (!found)
{
// Check if a newer version is present on the same OSD and masks the older one
// It happens for overwritten replicas in the following case:
// Version 1 is present on OSD 1,2,3
// Client tries to write Version 2
// OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again
// OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3
// (version 1 on OSD 3 is already masked/removed)
// Version 1 is not present on a full set, but it must not be removed
if (replicated)
{
for (int i = obj_start; i < ver_start; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
}
if (!found)
{
// Object is missing from one of the OSDs of that set.
// This means it's deleted or moved and we can safely drop this version.
target_ver = 0;
break;
}
}
}
}
}
void pg_obj_state_check_t::handle_version()
{
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
{
// Version is either stable or recoverable
ver_end = list_pos;
target_ver = last_ver;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
ver_end = list_pos;
}
if (!target_ver)
{
@@ -252,8 +178,6 @@ void pg_obj_state_check_t::finish_object()
// Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
}
obj_end = list_pos;
// Remember the decision
@@ -307,23 +231,11 @@ void pg_obj_state_check_t::finish_object()
}
}
}
if (!target_ver && (n_unstable >= obj_end-obj_start))
if (!target_ver)
{
return;
}
if (!target_ver)
{
// Object is present, but should not be :) i.e. it's a deleted object that reappeared
if (log_level > 1)
{
printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_DELETED;
pg->state = pg->state | PG_HAS_MISPLACED;
// To record all versions as outdated:
ver_end = obj_start;
}
else if (!replicated && n_roles < pg->pg_data_size)
if (!replicated && n_roles < pg->pg_data_size)
{
if (log_level > 1)
{
@@ -351,7 +263,7 @@ void pg_obj_state_check_t::finish_object()
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED)))
log_level > 2 && (state & OBJ_MISPLACED))
{
for (int i = obj_start; i < obj_end; i++)
{
@@ -360,9 +272,9 @@ void pg_obj_state_check_t::finish_object()
}
}
pg->total_count++;
osd_set.clear();
if (target_ver != 0 && (state != 0 || ver_end < obj_end))
if (state != 0 || ver_end < obj_end)
{
osd_set.clear();
for (int i = ver_start; i < ver_end; i++)
{
osd_set.push_back((pg_obj_loc_t){
@@ -385,8 +297,7 @@ void pg_obj_state_check_t::finish_object()
break;
}
}
if (j >= osd_set.size() && ((state & OBJ_DELETED) ||
pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num))
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
{
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
@@ -401,11 +312,7 @@ void pg_obj_state_check_t::finish_object()
}
}
}
if (state & OBJ_DELETED)
{
pg->ver_override[oid] = max_ver;
}
else if (target_ver < max_ver)
if (target_ver < max_ver)
{
pg->ver_override[oid] = target_ver;
}
@@ -459,7 +366,6 @@ void pg_obj_state_check_t::finish_object()
}
else
{
assert(it->second.state == state);
it->second.object_count++;
}
if (state & OBJ_INCOMPLETE)
@@ -480,34 +386,6 @@ void pg_obj_state_check_t::finish_object()
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level)
{
// Calculate intersections of target_history with cur_peers
for (auto & history_item: target_history)
{
if (history_item.max_epoch)
{
pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch];
set_copy.min_epoch = history_item.min_epoch;
set_copy.max_epoch = history_item.max_epoch;
for (int i = 0; i < history_item.osd_set.size(); i++)
{
if (history_item.osd_set[i] != 0)
{
for (int j = 0; j < cur_set.size(); j++)
{
if (cur_set[j] == history_item.osd_set[i])
{
set_copy.osd_set.push_back(history_item.osd_set[i]);
break;
}
}
}
}
if (set_copy.osd_set.size() != pg_size)
{
epoch_sizes_differ = true;
}
}
}
// Copy all object lists into one array
pg_obj_state_check_t st;
st.log_level = log_level;
@@ -544,18 +422,10 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states
st.walk();
target_by_epoch.clear(); // needed only in this function
if (this->state != PG_ACTIVE)
{
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++;
for (auto & pgh: target_history)
{
if (epoch <= pgh.max_epoch)
{
epoch = pgh.max_epoch+1;
}
}
}
if (log_level > 0)
{

View File

@@ -89,9 +89,7 @@ struct pg_t
// epoch number - should increase with each non-clean activation of the PG
uint64_t epoch = 0, reported_epoch = 0;
// target history and all potential peers
std::vector<pg_history_set_t> target_history;
std::map<uint64_t, pg_history_set_t> target_by_epoch;
bool epoch_sizes_differ = false;
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool history_changed = false;
// peer list from the last peering event

View File

@@ -186,10 +186,22 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
cur_op->reply.rw.bitmap_len = 0;
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
for (int role = 0; role < op_data->pg_data_size; role++)
if (cur_op->req.rw.len == 0)
{
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
// len=0 => bitmap read
for (int role = 0; role < op_data->pg_data_size; role++)
{
op_data->stripes[role].read_start = 0;
op_data->stripes[role].read_end = UINT32_MAX;
}
}
else
{
for (int role = 0; role < op_data->pg_data_size; role++)
{
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
}
}
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
@@ -199,21 +211,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
{
// PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
// Object is deleted, just return zeroes
cur_op->reply.rw.version = 0;
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len;
while (zero_len >= 0)
{
uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size;
cur_op->iov.push_back(zero_buffer, cur_zero_len);
zero_len -= cur_zero_len;
}
finish_op(cur_op, cur_op->req.rw.len);
return;
}
}
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
@@ -305,7 +302,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
report_pg_state(pg);
}
}
else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED))
else if (object_state->state & OBJ_MISPLACED)
{
this->misplaced_objects--;
pg.misplaced_objects.erase(oid);
@@ -344,6 +341,12 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
{
finish_op(cur_op, -EBUSY);
return;
}
if (!check_write_queue(cur_op, pg))
{
return;
@@ -351,18 +354,11 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
resume_1:
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
op_data->fact_ver = pg.ver_override[op_data->oid];
}
else
{
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
resume_2:
op_data->st = 2;
return;
}
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{

View File

@@ -133,12 +133,6 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
pg_osd_set_state_t *object_state;
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, zero out the bitmap
memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size);
continue;
}
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
osd_num_t read_target = 0;

View File

@@ -119,19 +119,17 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
n_subops++;
}
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0)
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
n_subops = 1;
else
zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_subops;
if (n_subops > 0)
{
op_data->subops = new osd_op_t[n_subops];
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
op_data->subops = subops;
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
@@ -153,6 +151,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
{
int stripe_num = rep ? 0 : role;
osd_op_t *subop = op_data->subops + i;
uint32_t subop_len = wr
? stripes[stripe_num].write_end - stripes[stripe_num].write_start
: stripes[stripe_num].read_end - stripes[stripe_num].read_start;
if (!wr && stripes[stripe_num].read_end == UINT32_MAX)
{
subop_len = 0;
}
if (role_osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
@@ -171,7 +176,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
},
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.len = subop_len,
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
.bitmap = stripes[stripe_num].bmp_buf,
});
@@ -201,7 +206,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
},
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.len = subop_len,
.attr_len = wr ? clean_entry_bitmap_size : 0,
};
#ifdef OSD_DEBUG
@@ -220,9 +225,9 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
}
else
{
if (stripes[stripe_num].read_end > stripes[stripe_num].read_start)
if (subop_len > 0)
{
subop->iov.push_back(stripes[stripe_num].read_buf, stripes[stripe_num].read_end - stripes[stripe_num].read_start);
subop->iov.push_back(stripes[stripe_num].read_buf, subop_len);
}
}
subop->callback = [cur_op, this](osd_op_t *subop)

View File

@@ -81,7 +81,7 @@ resume_1:
if (!cur_op->rmw_buf)
{
// Refuse partial overwrite of an incomplete object
cur_op->reply.hdr.retval = -EINVAL;
cur_op->reply.hdr.retval = -EIO;
goto continue_others;
}
}
@@ -156,13 +156,21 @@ resume_3:
{
// Report newer epoch before writing
// FIXME: We don't have to report all changed PG states here
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (pg.state != PG_ACTIVE)
{
// Check that current OSD set is in history and/or add it there
add_pg_history(pg);
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg.cur_set)
if (peer_osd != 0)
history_set.push_back(peer_osd);
std::sort(history_set.begin(), history_set.end());
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
if (it == pg.target_history.end() || *it != history_set)
pg.target_history.insert(it, history_set);
}
pg.history_changed = true;
report_pg_state(pg);
report_pg_states();
resume_10:
if (pg.epoch > pg.reported_epoch)
{
@@ -189,6 +197,11 @@ resume_5:
}
if (op_data->errors > 0)
{
// FIXME: Handle ENOSPC. If one of the subops fail with ENOSPC here,
// next writes to the same object will also fail because they'll try
// to overwrite the same version number which will result in EEXIST.
// To fix it, we should mark the object as degraded for replicas,
// and rollback successful part updates in case of EC.
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}

View File

@@ -28,7 +28,9 @@ static inline void extend_read(uint32_t start, uint32_t end, osd_rmw_stripe_t &
}
else
{
if (stripe.read_end < end)
if (stripe.read_end < end && end != UINT32_MAX ||
// UINT32_MAX means that stripe only needs bitmap, end != 0 => needs also data
stripe.read_end == UINT32_MAX && end != 0)
stripe.read_end = end;
if (stripe.read_start > start)
stripe.read_start = start;
@@ -105,24 +107,30 @@ void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size, uint32_t bi
}
else if (prev >= 0)
{
assert(stripes[role].read_start >= stripes[prev].read_start &&
stripes[role].read_start >= stripes[other].read_start);
memxor(
(uint8_t*)stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
(uint8_t*)stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
if (stripes[role].read_end != UINT32_MAX)
{
assert(stripes[role].read_start >= stripes[prev].read_start &&
stripes[role].read_start >= stripes[other].read_start);
memxor(
(uint8_t*)stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
(uint8_t*)stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
}
memxor(stripes[prev].bmp_buf, stripes[other].bmp_buf, stripes[role].bmp_buf, bitmap_size);
prev = -1;
}
else
{
assert(stripes[role].read_start >= stripes[other].read_start);
memxor(
stripes[role].read_buf,
(uint8_t*)stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
if (stripes[role].read_end != UINT32_MAX)
{
assert(stripes[role].read_start >= stripes[other].read_start);
memxor(
stripes[role].read_buf,
(uint8_t*)stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
}
memxor(stripes[role].bmp_buf, stripes[other].bmp_buf, stripes[role].bmp_buf, bitmap_size);
}
}
@@ -356,20 +364,23 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
uint64_t read_start = 0, read_end = 0;
auto recover_seq = [&]()
{
int orig = 0;
for (int other = 0; other < pg_size; other++)
if (read_end != UINT32_MAX)
{
if (stripes[other].read_end != 0 && !stripes[other].missing)
int orig = 0;
for (int other = 0; other < pg_size && orig < pg_minsize; other++)
{
assert(stripes[other].read_start <= read_start);
assert(stripes[other].read_end >= read_end);
data_ptrs[orig++] = (uint8_t*)stripes[other].read_buf + (read_start - stripes[other].read_start);
if (stripes[other].read_end != 0 && !stripes[other].missing)
{
assert(stripes[other].read_start <= read_start);
assert(stripes[other].read_end >= read_end);
data_ptrs[orig++] = (uint8_t*)stripes[other].read_buf + (read_start - stripes[other].read_start);
}
}
ec_encode_data(
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*32*pg_minsize,
data_ptrs, data_ptrs + pg_minsize
);
}
ec_encode_data(
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*32*pg_minsize,
data_ptrs, data_ptrs + pg_minsize
);
wanted_base += wanted;
wanted = 0;
};
@@ -391,6 +402,32 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
{
recover_seq();
}
// Recover bitmaps
if (bitmap_size > 0)
{
for (int role = 0; role < pg_minsize; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
{
data_ptrs[pg_minsize + (wanted++)] = (uint8_t*)stripes[role].bmp_buf;
}
}
if (wanted > 0)
{
int orig = 0;
for (int other = 0; other < pg_size && orig < pg_minsize; other++)
{
if (stripes[other].read_end != 0 && !stripes[other].missing)
{
data_ptrs[orig++] = (uint8_t*)stripes[other].bmp_buf;
}
}
ec_encode_data(
bitmap_size, pg_minsize, wanted, dectable,
data_ptrs, data_ptrs + pg_minsize
);
}
}
}
#else
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
@@ -412,7 +449,8 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
if (stripes[role].read_end != 0 && stripes[role].missing)
{
recovered = true;
if (stripes[role].read_end > stripes[role].read_start)
if (stripes[role].read_end > stripes[role].read_start &&
stripes[role].read_end != UINT32_MAX)
{
for (int other = 0; other < pg_size; other++)
{
@@ -531,7 +569,8 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
uint64_t buf_size = add_size;
for (int role = 0; role < read_pg_size; role++)
{
if (stripes[role].read_end != 0)
if (stripes[role].read_end != 0 &&
stripes[role].read_end != UINT32_MAX)
{
buf_size += stripes[role].read_end - stripes[role].read_start;
}
@@ -541,7 +580,8 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
uint64_t buf_pos = add_size;
for (int role = 0; role < read_pg_size; role++)
{
if (stripes[role].read_end != 0)
if (stripes[role].read_end != 0 &&
stripes[role].read_end != UINT32_MAX)
{
stripes[role].read_buf = (uint8_t*)buf + buf_pos;
buf_pos += stripes[role].read_end - stripes[role].read_start;

View File

@@ -23,6 +23,7 @@ struct osd_rmw_stripe_t
void *read_buf, *write_buf;
void *bmp_buf;
uint32_t req_start, req_end;
// read_end=UINT32_MAX means to only read bitmap, but not data
uint32_t read_start, read_end;
uint32_t write_start, write_end;
bool missing;

View File

@@ -27,6 +27,7 @@ void test13();
void test14();
void test15(bool second);
void test16();
void test_recover_22_d2();
int main(int narg, char *args[])
{
@@ -61,6 +62,8 @@ int main(int narg, char *args[])
test15(true);
// Test 16
test16();
// Test 17
test_recover_22_d2();
// End
printf("all ok\n");
return 0;
@@ -1045,7 +1048,12 @@ void test16()
assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024);
set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
memcpy(stripes[3].read_buf, rmw_buf, 128*1024);
memset(stripes[0].bmp_buf, 0xa8, bmp);
memset(stripes[2].bmp_buf, 0xb7, bmp);
assert(bitmaps[1] == 0xFFFFFFFF);
assert(bitmaps[3] == 0xF1F1F1F1);
reconstruct_stripes_ec(stripes, 4, 2, bmp);
assert(*(uint32_t*)stripes[3].bmp_buf == 0xF1F1F1F1);
assert(bitmaps[0] == 0xFFFFFFFF);
check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
free(read_buf);
@@ -1054,3 +1062,47 @@ void test16()
free(write_buf);
use_ec(4, 2, false);
}
/***
17. EC 2+2 recover second data block
***/
void test_recover_22_d2()
{
const int bmp = 128*1024 / 4096 / 8;
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 0, 3, 4 };
osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 };
// Read 0-256K
split_stripes(2, 128*1024, 0, 256*1024, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
uint8_t *data_buf = (uint8_t*)malloc_or_die(128*1024*4);
for (int i = 0; i < 4; i++)
{
stripes[i].read_start = stripes[i].req_start;
stripes[i].read_end = stripes[i].req_end;
stripes[i].read_buf = data_buf + i*128*1024;
stripes[i].bmp_buf = bitmaps + i;
}
// Read using parity
assert(extend_missing_stripes(stripes, osd_set, 2, 4) == 0);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
bitmaps[0] = 0xffffffff;
bitmaps[2] = 0;
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2);
// Reconstruct
reconstruct_stripes_ec(stripes, 4, 2, bmp);
check_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
assert(bitmaps[1] == 0xFFFFFFFF);
free(data_buf);
// Done
use_ec(4, 2, false);
}

View File

@@ -32,7 +32,6 @@
#define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08
#define OBJ_DELETED 0x10
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000

View File

@@ -218,6 +218,7 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
}
}
// FIXME: Fix thread safety of the driver - now it segfaults when iothread is enabled in QEMU
static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
{
aio_set_fd_handler(ctx, fd,

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.8
Version: 0.8.9
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -36,12 +36,12 @@ for i in $(seq 2 $ETCD_COUNT); do
ETCD_URL="$ETCD_URL,http://$ETCD_IP:$((ETCD_PORT+2*i-2))"
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-1))"
done
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL"
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=5s --command-timeout=10s"
start_etcd()
{
local i=$1
$ETCD -name etcd$i --data-dir ./testdata/etcd$i \
ionice -c2 -n0 $ETCD -name etcd$i --data-dir ./testdata/etcd$i \
--advertise-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) --listen-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) \
--initial-advertise-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) --listen-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) \
--initial-cluster-token vitastor-tests-etcd --initial-cluster-state new \
@@ -53,9 +53,12 @@ start_etcd()
for i in $(seq 1 $ETCD_COUNT); do
start_etcd $i
done
if [ $ETCD_COUNT -gt 1 ]; then
sleep 1
fi
for i in {1..30}; do
${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=1s --command-timeout=1s member list >/dev/null && break
if [[ $i = 30 ]]; then
format_error "Failed to start etcd"
fi
done
echo leak:fio >> testdata/lsan-suppress.txt
echo leak:tcmalloc >> testdata/lsan-suppress.txt

View File

@@ -39,19 +39,19 @@ done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
(while true; do node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) &>./testdata/mon.log &
MON_PID=$!
if [ "$SCHEME" = "ec" ]; then
PG_SIZE=${PG_SIZE:-5}
PG_MINSIZE=${PG_MINSIZE:-3}
PG_DATA_SIZE=$PG_MINSIZE
POOLCFG='"scheme":"ec","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
PG_MINSIZE=${PG_MINSIZE:-4}
PG_DATA_SIZE=${PG_DATA_SIZE:-3}
POOLCFG='"scheme":"ec","parity_chunks":'$((PG_SIZE-PG_DATA_SIZE))
elif [ "$SCHEME" = "xor" ]; then
PG_SIZE=${PG_SIZE:-3}
PG_MINSIZE=${PG_MINSIZE:-2}
PG_DATA_SIZE=$PG_MINSIZE
POOLCFG='"scheme":"xor","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
PG_MINSIZE=${PG_MINSIZE:-3}
PG_DATA_SIZE=$((PG_SIZE-1))
POOLCFG='"scheme":"xor","parity_chunks":1'
else
PG_SIZE=${PG_SIZE:-2}
PG_MINSIZE=${PG_MINSIZE:-2}
@@ -100,13 +100,13 @@ wait_finish_rebalance()
sec=$1
i=0
while [[ $i -lt $sec ]]; do
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT) && \
break
if [ $i -eq 60 ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}
@@ -117,3 +117,14 @@ check_qemu()
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
fi
}
check_nbd()
{
if [[ -d /sys/module/nbd && ! -e /dev/nbd0 ]]; then
max_part=$(cat /sys/module/nbd/parameters/max_part)
nbds_max=$(cat /sys/module/nbd/parameters/nbds_max)
for i in $(seq 1 $nbds_max); do
mknod /dev/nbd$((i-1)) b 43 $(((i-1)*(max_part+1)))
done
fi
}

View File

@@ -14,8 +14,6 @@ SCHEME=ec ./test_change_pg_count.sh
./test_create_nomaxid.sh
./test_degraded_delete.sh
./test_etcd_fail.sh
./test_failure_domain.sh
@@ -46,5 +44,5 @@ SCHEME=xor ./test_write.sh
./test_write_no_same.sh
./test_heal.sh
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh
PG_SIZE=2 ./test_heal.sh
SCHEME=ec ./test_heal.sh

View File

@@ -15,10 +15,10 @@ done
sleep 2
for i in {1..10}; do
for i in {1..30}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only |\
jq -s -e '([ .[0].items["1"] | map(.osd_set)[][] ] | sort | unique == ["1","2","3","4"])') && \
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT'') && \
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT) && \
break
sleep 1
done
@@ -28,7 +28,7 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
format_error "FAILED: OSD NOT ADDED INTO DISTRIBUTION"
fi
wait_finish_rebalance 10
wait_finish_rebalance 60
sleep 1
kill -9 $OSD4_PID
@@ -37,7 +37,7 @@ build/src/vitastor-cli --etcd_address $ETCD_URL rm-osd --force 4
sleep 2
for i in {1..10}; do
for i in {1..30}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only |\
jq -s -e '([ .[0].items["1"] | map(.osd_set)[][] ] | sort | unique == ["1","2","3"])') && \
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT'') && \
@@ -50,6 +50,6 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
format_error "FAILED: OSD NOT REMOVED FROM DISTRIBUTION"
fi
wait_finish_rebalance 10
wait_finish_rebalance 60
format_green OK

View File

@@ -21,7 +21,7 @@ try_change()
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$n'}}'
for i in {1..10}; do
for i in {1..60}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "has_misplaced"]) ] | length) == '$n'') && \
break

View File

@@ -1,131 +0,0 @@
#!/bin/bash -ex
# Run 3 OSDs
. `dirname $0`/run_3osds.sh
# Write inodes 1 and 2
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 \
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 &>/dev/null &
sleep 5
# Stop OSD 1
kill -INT $OSD1_PID
sleep 2
# Remove inode 2
build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2
# Run 3 more OSDs and move PG to 4,5,6
for i in $(seq 4 6); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}'
# Wait for rebalance to finish
wait_finish_rebalance()
{
local sec=$1
local st=$2
local i=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \
'([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}
wait_finish_rebalance 60 '"active","left_on_dead"'
# Stop OSD 2,3
kill -INT $OSD2_PID
kill -INT $OSD3_PID
sleep 2
# Verify that PGs are still active
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
# Start OSD 1
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
# Verify that inode 2 is removed and inode 1 is in place
wait_repeer_1()
{
local sec=$1
local i=0
while [[ $i -lt $sec ]]; do
if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "OSD 4/5/6 do not peer with older OSD 1"
fi
done
}
wait_repeer_1 15
wait_finish_rebalance 15 '"active"'
if [ "$SCHEME" = "replicated" ]; then
NOBJ=1024
else
NOBJ=$((1024/(PG_SIZE-1)))
fi
if ! ($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == '$NOBJ); then
format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T"
fi
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \
-O raw ./testdata/inode1.bin
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \
-O raw ./testdata/inode2.bin
if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then
format_error "FAILED: INODE 1 SEEMS LOST"
fi
if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then
format_error "FAILED: INODE 2 SEEMS RESTORED"
fi
format_green OK

View File

@@ -2,7 +2,11 @@
# Kill OSDs while writing
PG_SIZE=3
PG_SIZE=${PG_SIZE:-3}
if [[ "$SCHEME" = "ec" ]]; then
PG_DATA_SIZE=${PG_DATA_SIZE:-2}
PG_MINSIZE=${PG_MINSIZE:-3}
fi
OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh

View File

@@ -30,8 +30,11 @@ try_reweight 4 1
try_reweight 5 1
# Allow rebalance to start
sleep 5
# Wait for the rebalance to finish
wait_finish_rebalance 60
wait_finish_rebalance 300
# Check that PGs never had degraded objects !
# FIXME: In fact, the test doesn't guarantee it because PGs aren't always peered only with full prior OSD sets :-(

View File

@@ -15,11 +15,14 @@ $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicate
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[1,0],"primary":1}}}}'
sleep 2
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active","degraded"]'); then
format_error "Failed to start the PG active+degraded"
fi
for i in {1..30}; do
sleep 1
if ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active","degraded"]'); then
break
elif [[ $i = 30 ]]; then
format_error "Failed to start the PG active+degraded"
fi
done
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
@@ -27,45 +30,60 @@ fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[1,0],"primary":0}}}}'
sleep 2
if [ "`$ETCDCTL get /vitastor/pg/state/1/1 --print-value-only`" != "" ]; then
format_error "Failed to stop the PG"
fi
for i in {1..30}; do
sleep 1
if [[ "`$ETCDCTL get /vitastor/pg/state/1/1 --print-value-only`" = "" ]]; then
break
elif [[ $i = 30 ]]; then
format_error "Failed to stop the PG"
fi
done
$ETCDCTL put /vitastor/pg/history/1/1 '{"all_peers":[1,2,3]}'
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":4}}}}'
sleep 5
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
format_error "Failed to move degraded objects to the clean OSD set"
fi
for i in {1..30}; do
sleep 1
if ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
break
elif [[ $i = 30 ]]; then
format_error "Failed to move degraded objects to the clean OSD set"
fi
done
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":0}}}}'
$ETCDCTL put /vitastor/pg/history/1/1 '{"all_peers":[1,2,3]}'
sleep 2
if [ "`$ETCDCTL get /vitastor/pg/state/1/1 --print-value-only`" != "" ]; then
format_error "Failed to stop the PG after degraded recovery"
fi
for i in {1..30}; do
sleep 1
if [ "`$ETCDCTL get /vitastor/pg/state/1/1 --print-value-only`" = "" ]; then
break
elif [[ $i = 30 ]]; then
format_error "Failed to stop the PG after degraded recovery"
fi
done
cp testdata/osd4.log testdata/osd4_pre.log
>testdata/osd4.log
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":4}}}}'
sleep 2
if grep -q 'PG 1/1.*is.*has_' testdata/osd4.log; then
format_error "PG has degraded or misplaced objects after a full re-peer following a degraded recovery"
fi
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
format_error "PG not active+clean after a full re-peer following a degraded recovery"
fi
for i in {1..30}; do
sleep 1
if grep -q 'PG 1/1.*is.*has_' testdata/osd4.log; then
if [[ $i = 30 ]]; then
format_error "PG has degraded or misplaced objects after a full re-peer following a degraded recovery"
fi
elif ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
if [[ $i = 30 ]]; then
format_error "PG not active+clean after a full re-peer following a degraded recovery"
fi
else
break
fi
done
format_green OK

View File

@@ -4,11 +4,13 @@ OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
check_nbd
IMG_SIZE=256
$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'
NBD_DEV=$(sudo build/src/vitastor-nbd map --etcd_address $ETCD_URL --image testimg --logfile ./testdata/nbd.log &)
NBD_DEV=$(sudo build/src/vitastor-nbd map --nbd_timeout 180 --etcd_address $ETCD_URL --image testimg --logfile ./testdata/nbd.log &)
trap "sudo build/src/vitastor-nbd unmap $NBD_DEV"'; kill -9 $(jobs -p)' EXIT

View File

@@ -21,7 +21,7 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
# Kill OSD 2, start OSD 1
kill $OSD2_PID
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
sleep 2

View File

@@ -14,7 +14,7 @@ for i in $(seq 1 $OSD_COUNT); do
eval OSD${i}_PID=$!
done
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log &
(while true; do node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) &>./testdata/mon.log &
MON_PID=$!
sleep 3