forked from vitalif/vitastor
Compare commits
77 Commits
hier-failu
...
qemu-send-
Author | SHA1 | Date | |
---|---|---|---|
c3ec6dd3b9 | |||
c8d61568b5 | |||
84ed3c6395 | |||
a7b57386c0 | |||
9d4ea5f764 | |||
000e4944ec | |||
8426616d89 | |||
1a841344ec | |||
8603b5cb1d | |||
f12b8e45a9 | |||
878ccbb6ea | |||
b14220b4d0 | |||
181d6ba407 | |||
63c2b9832c | |||
10e2e6a7c8 | |||
a598428992 | |||
08a677b684 | |||
7c8fbdad16 | |||
2f9353df60 | |||
57c744f288 | |||
a11ca56fb1 | |||
b84927b340 | |||
83cacba226 | |||
2c8f0bc6d5 | |||
7ae5b0e368 | |||
926be372fd | |||
6222779b52 | |||
a4186e20aa | |||
c74a424930 | |||
32f2c4dd27 | |||
3ad16b9a1a | |||
1c2df841c2 | |||
aa5dacc7a9 | |||
affe8fc270 | |||
4fdc49bdc7 | |||
86b4682975 | |||
bdd48e4cf1 | |||
af8c3411cd | |||
9c405009f3 | |||
f9fbea25a4 | |||
2c9a10d081 | |||
150968070f | |||
cdfc74665b | |||
3f60fecd7c | |||
3b4cf29e65 | |||
eeaba11ebd | |||
aea567cfbd | |||
ce02f47de6 | |||
5fd3208616 | |||
5997b76535 | |||
f1961157f0 | |||
88c1ba0790 | |||
b5bd611683 | |||
fa90b5a4e7 | |||
8d40ad99a6 | |||
3475772b07 | |||
25fcedf6e7 | |||
6ca20aa194 | |||
4bfd994341 | |||
59e959dcbb | |||
a9581f0739 | |||
105a405b0a | |||
d55d7d5326 | |||
0e5d0e02a9 | |||
0439981a66 | |||
6648f6bb6e | |||
281be547eb | |||
0c78dd7178 | |||
3c924397e7 | |||
c3bd26193d | |||
43b77d7619 | |||
a6d846863b | |||
8dc427b43c | |||
bf2112653b | |||
0538a484b3 | |||
97720fa6b4 | |||
e60e352df6 |
@@ -10,6 +10,9 @@ RUN set -e -x; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
ln -s /root/qemu-build/qemu-*/ ./qemu; \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
cd mon; \
|
||||
npm install; \
|
||||
cd ..; \
|
||||
mkdir build; \
|
||||
cd build; \
|
||||
cmake .. -DWITH_ASAN=yes -DWITH_QEMU=yes; \
|
||||
|
@@ -71,7 +71,7 @@ jobs:
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
timeout-minutes: 10
|
||||
run: /root/vitastor/tests/test_add_osd.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
@@ -190,24 +190,6 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_failure_domain:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_failure_domain.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_interrupted_rebalance:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
@@ -280,7 +262,7 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_minsize_1:
|
||||
test_failure_domain:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
@@ -288,115 +270,7 @@ jobs:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_minsize_1.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_move_reappear:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_move_reappear.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_imm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_ec_imm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_rm.sh
|
||||
run: /root/vitastor/tests/test_failure_domain.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
@@ -442,6 +316,132 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_minsize_1:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_minsize_1.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_move_reappear:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_move_reappear.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_rm.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_snapshot_chain:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_snapshot_chain.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_snapshot_chain_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_snapshot_down:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_snapshot_down.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_snapshot_down_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec /root/vitastor/tests/test_snapshot_down.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_splitbrain:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
@@ -460,6 +460,78 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_imm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: SCHEME=ec /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_rebalance_verify_ec_imm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 10
|
||||
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_write:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
@@ -550,3 +622,111 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_zero_osd_2:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: ZERO_OSD=2 /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_xor:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=xor /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_pg_size_3:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: PG_SIZE=3 /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
|
@@ -9,7 +9,8 @@ for my $line (<>)
|
||||
chomp $line;
|
||||
my $test_name = $1;
|
||||
my $timeout = 3;
|
||||
if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_interrupted_rebalance')
|
||||
if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_add_osd' ||
|
||||
$test_name eq 'test_interrupted_rebalance' || $test_name eq 'test_rebalance_verify')
|
||||
{
|
||||
$timeout = 10;
|
||||
}
|
||||
|
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.8.9")
|
||||
set(VERSION "0.9.2")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.8.9
|
||||
VERSION ?= v0.9.2
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.8.9
|
||||
image: vitalif/vitastor-csi:v0.9.2
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.8.9
|
||||
image: vitalif/vitastor-csi:v0.9.2
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.8.9"
|
||||
vitastorCSIDriverVersion = "0.9.2"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
58
debian/build-pve-qemu.sh
vendored
Normal file
58
debian/build-pve-qemu.sh
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
exit
|
||||
|
||||
git clone https://git.yourcmc.ru/vitalif/pve-qemu .
|
||||
|
||||
# bookworm
|
||||
|
||||
docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-bullseye debian:bullseye bash
|
||||
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources
|
||||
echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve bookworm pve-no-subscription' >> /etc/apt/sources.list
|
||||
echo 'deb https://vitastor.io/debian bookworm main' >> /etc/apt/sources.list
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
|
||||
echo 'ru_RU UTF-8' >> /etc/locale.gen
|
||||
echo 'en_US UTF-8' >> /etc/locale.gen
|
||||
apt-get update
|
||||
apt-get install wget ca-certificates
|
||||
wget https://enterprise.proxmox.com/debian/proxmox-release-bookworm.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bookworm.gpg
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
|
||||
apt-get update
|
||||
apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
|
||||
mk-build-deps --install ./control
|
||||
|
||||
# bullseye
|
||||
|
||||
docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-bullseye debian:bullseye bash
|
||||
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb /deb-src /' >> /etc/apt/sources.list
|
||||
echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve bullseye pve-no-subscription' >> /etc/apt/sources.list
|
||||
echo 'deb https://vitastor.io/debian bullseye main' >> /etc/apt/sources.list
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
|
||||
echo 'ru_RU UTF-8' >> /etc/locale.gen
|
||||
echo 'en_US UTF-8' >> /etc/locale.gen
|
||||
apt-get update
|
||||
apt-get install wget
|
||||
wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
|
||||
apt-get update
|
||||
apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
|
||||
mk-build-deps --install ./control
|
||||
|
||||
# buster
|
||||
|
||||
docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-buster debian:buster bash
|
||||
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb /deb-src /' >> /etc/apt/sources.list
|
||||
echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve buster pve-no-subscription' >> /etc/apt/sources.list
|
||||
echo 'deb https://vitastor.io/debian buster main' >> /etc/apt/sources.list
|
||||
echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
|
||||
echo 'ru_RU UTF-8' >> /etc/locale.gen
|
||||
echo 'en_US UTF-8' >> /etc/locale.gen
|
||||
apt-get update
|
||||
apt-get install wget ca-certificates
|
||||
wget http://download.proxmox.com/debian/proxmox-ve-release-6.x.gpg -O /etc/apt/trusted.gpg.d/proxmox-ve-release-6.x.gpg
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
|
||||
apt-get update
|
||||
apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
|
||||
mk-build-deps --install ./control
|
7
debian/build-vitastor-bookworm.sh
vendored
Executable file
7
debian/build-vitastor-bookworm.sh
vendored
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.8.9-1) unstable; urgency=medium
|
||||
vitastor (0.9.2-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.8.9-1) unstable; urgency=medium
|
||||
vitastor (0.9.2-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
27
debian/patched-qemu.Dockerfile
vendored
27
debian/patched-qemu.Dockerfile
vendored
@@ -1,4 +1,4 @@
|
||||
# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
|
||||
# Build patched QEMU for Debian inside a container
|
||||
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
|
||||
|
||||
ARG REL=
|
||||
@@ -15,17 +15,19 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
|
||||
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
|
||||
fi; \
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y build-dep qemu
|
||||
# To build a custom version
|
||||
#RUN cp /root/packages/qemu-orig/* /root
|
||||
RUN apt-get --download-only source qemu
|
||||
|
||||
ADD patches/qemu-5.0-vitastor.patch patches/qemu-5.1-vitastor.patch patches/qemu-6.1-vitastor.patch src/qemu_driver.c /root/vitastor/patches/
|
||||
ADD patches /root/vitastor/patches
|
||||
ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
|
||||
RUN set -e; \
|
||||
apt-get install -y wget; \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
|
||||
@@ -37,23 +39,14 @@ RUN set -e; \
|
||||
rm -rf /root/packages/qemu-$REL/*; \
|
||||
cd /root/packages/qemu-$REL; \
|
||||
dpkg-source -x /root/qemu*.dsc; \
|
||||
if ls -d /root/packages/qemu-$REL/qemu-5.0*; then \
|
||||
D=$(ls -d /root/packages/qemu-$REL/qemu-5.0*); \
|
||||
cp /root/vitastor/patches/qemu-5.0-vitastor.patch $D/debian/patches; \
|
||||
echo qemu-5.0-vitastor.patch >> $D/debian/patches/series; \
|
||||
elif ls /root/packages/qemu-$REL/qemu-6.1*; then \
|
||||
D=$(ls -d /root/packages/qemu-$REL/qemu-6.1*); \
|
||||
cp /root/vitastor/patches/qemu-6.1-vitastor.patch $D/debian/patches; \
|
||||
echo qemu-6.1-vitastor.patch >> $D/debian/patches/series; \
|
||||
else \
|
||||
cp /root/vitastor/patches/qemu-5.1-vitastor.patch /root/packages/qemu-$REL/qemu-*/debian/patches; \
|
||||
P=`ls -d /root/packages/qemu-$REL/qemu-*/debian/patches`; \
|
||||
echo qemu-5.1-vitastor.patch >> $P/series; \
|
||||
fi; \
|
||||
QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*(\d+\.\d+).*!$1!'); \
|
||||
D=$(ls -d qemu*/); \
|
||||
cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
|
||||
echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \
|
||||
cd /root/packages/qemu-$REL/qemu-*/; \
|
||||
quilt push -a; \
|
||||
quilt add block/vitastor.c; \
|
||||
cp /root/vitastor/patches/qemu_driver.c block/vitastor.c; \
|
||||
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
|
||||
quilt refresh; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
|
||||
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
|
||||
|
13
debian/vitastor.Dockerfile
vendored
13
debian/vitastor.Dockerfile
vendored
@@ -1,4 +1,4 @@
|
||||
# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
|
||||
# Build Vitastor packages for Debian inside a container
|
||||
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
|
||||
|
||||
ARG REL=
|
||||
@@ -15,11 +15,12 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
|
||||
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
|
||||
fi; \
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get -y install fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y build-dep fio
|
||||
RUN apt-get --download-only source fio
|
||||
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev
|
||||
@@ -34,8 +35,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.8.9; \
|
||||
cd vitastor-0.8.9; \
|
||||
cp -r /root/vitastor vitastor-0.9.2; \
|
||||
cd vitastor-0.9.2; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +49,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.9.orig.tar.xz vitastor-0.8.9; \
|
||||
cd vitastor-0.8.9; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.2.orig.tar.xz vitastor-0.9.2; \
|
||||
cd vitastor-0.9.2; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -21,7 +21,7 @@ Configuration parameters can be set in 3 places:
|
||||
mon, fio and QEMU options, OpenStack/Proxmox/etc configuration. The latter
|
||||
doesn't allow to set all variables directly, but it allows to override the
|
||||
configuration file and set everything you need inside it.
|
||||
- OSD superblocks created by [vitastor-disk](../usage/disk.en.md) contain
|
||||
- OSD superblocks created by [vitastor-disk](usage/disk.en.md) contain
|
||||
primarily disk layout parameters of specific OSDs. In fact, these parameters
|
||||
are automatically passed into the command line of vitastor-osd process, so
|
||||
they have the same "status" as command-line parameters.
|
||||
|
@@ -23,7 +23,7 @@
|
||||
монитора, опциях fio и QEMU, настроек OpenStack, Proxmox и т.п. Последние,
|
||||
как правило, не включают полный набор параметров напрямую, но позволяют
|
||||
определить путь к файлу конфигурации и задать любые параметры в нём.
|
||||
- В суперблоке OSD, записываемом [vitastor-disk](../usage/disk.ru.md) - параметры,
|
||||
- В суперблоке OSD, записываемом [vitastor-disk](usage/disk.ru.md) - параметры,
|
||||
связанные с дисковым форматом и с этим конкретным OSD. На самом деле,
|
||||
при запуске OSD эти параметры автоматически передаются в командную строку
|
||||
процесса vitastor-osd, то есть по "статусу" они эквивалентны параметрам
|
||||
|
@@ -25,11 +25,16 @@ running if required parameters are specified.
|
||||
## etcd_address
|
||||
|
||||
- Type: string or array of strings
|
||||
- Can be changed online: yes
|
||||
|
||||
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
|
||||
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
Note that https is not supported for etcd connections yet.
|
||||
|
||||
etcd connection endpoints can be changed online by updating global
|
||||
configuration in etcd itself - this allows to switch the cluster to new
|
||||
etcd addresses without downtime.
|
||||
|
||||
## etcd_prefix
|
||||
|
||||
- Type: string
|
||||
@@ -42,5 +47,6 @@ example, use a single etcd cluster for multiple Vitastor clusters.
|
||||
|
||||
- Type: integer
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Log level. Raise if you want more verbose output.
|
||||
|
@@ -24,10 +24,14 @@
|
||||
## etcd_address
|
||||
|
||||
- Тип: строка или массив строк
|
||||
- Можно менять на лету: да
|
||||
|
||||
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
|
||||
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
|
||||
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
|
||||
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
|
||||
|
||||
## etcd_prefix
|
||||
|
||||
- Тип: строка
|
||||
@@ -41,5 +45,6 @@
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Уровень логгирования. Повысьте, если хотите более подробный вывод.
|
||||
|
@@ -153,6 +153,7 @@ operations.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Interval before attempting to reconnect to an unavailable OSD.
|
||||
|
||||
@@ -161,6 +162,7 @@ Interval before attempting to reconnect to an unavailable OSD.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for OSD connection attempts.
|
||||
|
||||
@@ -169,6 +171,7 @@ Timeout for OSD connection attempts.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
OSD connection inactivity time after which clients and other OSDs send
|
||||
keepalive requests to check state of the connection.
|
||||
@@ -178,6 +181,7 @@ keepalive requests to check state of the connection.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
@@ -188,6 +192,7 @@ is scheduled.
|
||||
- Type: milliseconds
|
||||
- Default: 500
|
||||
- Minimum: 50
|
||||
- Can be changed online: yes
|
||||
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
@@ -197,6 +202,7 @@ the time for the clients to wait before re-attempting such I/O requests.
|
||||
|
||||
- Type: integer
|
||||
- Default: 5
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of attempts for etcd requests which can't be retried
|
||||
indefinitely.
|
||||
@@ -205,6 +211,7 @@ indefinitely.
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 1000
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
|
||||
@@ -212,6 +219,7 @@ Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 5000
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd requests which are allowed to wait for some time.
|
||||
|
||||
@@ -219,6 +227,7 @@ Timeout for etcd requests which are allowed to wait for some time.
|
||||
|
||||
- Type: seconds
|
||||
- Default: max(30, etcd_report_interval*2)
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
@@ -227,6 +236,7 @@ etcd_report_interval to guarantee that keepalive actually works.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 30
|
||||
- Can be changed online: yes
|
||||
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
detect disconnections quickly.
|
||||
@@ -235,6 +245,7 @@ detect disconnections quickly.
|
||||
|
||||
- Type: integer
|
||||
- Default: 33554432
|
||||
- Can be changed online: yes
|
||||
|
||||
Without immediate_commit=all this parameter sets the limit of "dirty"
|
||||
(not committed by fsync) data allowed by the client before forcing an
|
||||
|
@@ -161,6 +161,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время ожидания перед повторной попыткой соединиться с недоступным OSD.
|
||||
|
||||
@@ -169,6 +170,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время ожидания попытки соединения с OSD.
|
||||
|
||||
@@ -177,6 +179,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время неактивности соединения с OSD, после которого клиенты или другие OSD
|
||||
посылают запрос проверки состояния соединения.
|
||||
@@ -186,6 +189,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время ожидания ответа на запрос проверки состояния соединения.
|
||||
Если OSD не отвечает за это время, соединение отключается и производится
|
||||
@@ -196,6 +200,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 500
|
||||
- Минимальное значение: 50
|
||||
- Можно менять на лету: да
|
||||
|
||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
|
||||
@@ -207,6 +212,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число попыток выполнения запросов к etcd для тех запросов,
|
||||
которые нельзя повторять бесконечно.
|
||||
@@ -215,6 +221,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 1000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время выполнения запросов к etcd, которые должны завершаться
|
||||
быстро, таких, как обновление резервации (lease).
|
||||
@@ -223,6 +230,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 5000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время выполнения запросов к etcd, для которых не обязательно
|
||||
гарантировать быстрое выполнение.
|
||||
@@ -231,6 +239,7 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: max(30, etcd_report_interval*2)
|
||||
- Можно менять на лету: да
|
||||
|
||||
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
@@ -239,6 +248,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 30
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал проверки живости вебсокет-подключений к etcd.
|
||||
|
||||
@@ -246,6 +256,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 33554432
|
||||
- Можно менять на лету: да
|
||||
|
||||
При работе без immediate_commit=all - это лимит объёма "грязных" (не
|
||||
зафиксированных fsync-ом) данных, при достижении которого клиент будет
|
||||
|
@@ -7,7 +7,8 @@
|
||||
# Runtime OSD Parameters
|
||||
|
||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||
initialization and can be changed with an OSD restart.
|
||||
initialization and can be changed - either with an OSD restart or, for some of
|
||||
them, even without restarting by updating configuration in etcd.
|
||||
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -38,6 +39,14 @@ initialization and can be changed with an OSD restart.
|
||||
- [throttle_target_parallelism](#throttle_target_parallelism)
|
||||
- [throttle_threshold_us](#throttle_threshold_us)
|
||||
- [osd_memlock](#osd_memlock)
|
||||
- [auto_scrub](#auto_scrub)
|
||||
- [no_scrub](#no_scrub)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [scrub_queue_depth](#scrub_queue_depth)
|
||||
- [scrub_sleep](#scrub_sleep)
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -91,6 +100,7 @@ OSD by hand.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||
the immediate_commit mode if disabled. fsyncs are required because without
|
||||
@@ -103,6 +113,7 @@ issue fsyncs at all.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||
operations before issuing an fsync operation internally.
|
||||
@@ -111,6 +122,7 @@ operations before issuing an fsync operation internally.
|
||||
|
||||
- Type: integer
|
||||
- Default: 4
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed or recovery
|
||||
@@ -120,6 +132,7 @@ and rebalancing, but it's planned to implement more.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
@@ -130,6 +143,7 @@ Degraded PGs are anyway scanned first.
|
||||
|
||||
- Type: integer
|
||||
- Default: 16
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of recovery operations before issuing an additional fsync.
|
||||
|
||||
@@ -145,6 +159,7 @@ the underlying device. This may be useful for recovery purposes.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Disable automatic background recovery of objects. Note that it doesn't
|
||||
affect implicit recovery of objects happening during writes - a write is
|
||||
@@ -154,6 +169,7 @@ always made to a full set of at least pg_minsize OSDs.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Disable background movement of data between different OSDs. Disabling it
|
||||
means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
@@ -162,6 +178,7 @@ means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 3
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which OSDs print simple human-readable operation
|
||||
statistics on stdout.
|
||||
@@ -170,6 +187,7 @@ statistics on stdout.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 10
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which OSDs dump slow or stuck operations on stdout, if
|
||||
they're any. Also it's the time after which an operation is considered
|
||||
@@ -179,6 +197,7 @@ they're any. Also it's the time after which an operation is considered
|
||||
|
||||
- Type: seconds
|
||||
- Default: 60
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
|
||||
@@ -186,6 +205,7 @@ Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Parallel client write operation limit per one OSD. Operations that exceed
|
||||
this limit are pushed to a temporary queue instead of being executed
|
||||
@@ -195,6 +215,7 @@ immediately.
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Flusher is a micro-thread that moves data from the journal to the data
|
||||
area of the device. Their number is auto-tuned between minimum and maximum.
|
||||
@@ -204,6 +225,7 @@ Minimum number is set by this parameter.
|
||||
|
||||
- Type: integer
|
||||
- Default: 256
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of journal flushers (see above min_flusher_count).
|
||||
|
||||
@@ -260,6 +282,7 @@ Most (99%) other SSDs don't need this option.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
|
||||
with fast journal/metadata devices and slow data devices. The idea is that
|
||||
@@ -277,6 +300,7 @@ fills up.
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum number of throttled operations per second under the condition
|
||||
of full journal. Set it to approximate random write iops of your data devices
|
||||
@@ -286,6 +310,7 @@ of full journal. Set it to approximate random write iops of your data devices
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum bandwidth in MB/s of throttled operations per second under
|
||||
the condition of full journal. Set it to approximate linear write
|
||||
@@ -295,6 +320,7 @@ performance of your data devices (HDDs).
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum parallelism of throttled operations under the condition of
|
||||
full journal. Set it to approximate internal parallelism of your data
|
||||
@@ -304,6 +330,7 @@ devices (1 for HDDs, 4-8 for SSDs).
|
||||
|
||||
- Type: microseconds
|
||||
- Default: 50
|
||||
- Can be changed online: yes
|
||||
|
||||
Minimal computed delay to be applied to throttled operations. Usually
|
||||
doesn't need to be changed.
|
||||
@@ -313,4 +340,103 @@ doesn't need to be changed.
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with
|
||||
mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
|
||||
## auto_scrub
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Data scrubbing is the process of background verification of copies to find
|
||||
and repair corrupted blocks. It's not run automatically by default since
|
||||
it's a new feature. Set this parameter to true to enable automatic scrubs.
|
||||
|
||||
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
|
||||
every `scrub_interval` (see below). You can also start/schedule scrubbing
|
||||
manually by setting `next_scrub` JSON key to the desired UNIX time of the
|
||||
next scrub in `/pg/history/...` values in etcd.
|
||||
|
||||
## no_scrub
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Temporarily disable scrubbing and stop running scrubs.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Type: string
|
||||
- Default: 30d
|
||||
- Can be changed online: yes
|
||||
|
||||
Default automatic scrubbing interval for all pools. Numbers without suffix
|
||||
are treated as seconds, possible unit suffixes include 's' (seconds),
|
||||
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
|
||||
|
||||
## scrub_queue_depth
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of parallel scrubbing operations per one OSD.
|
||||
|
||||
## scrub_sleep
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Additional interval between two consecutive scrubbing operations on one OSD.
|
||||
Can be used to slow down scrubbing if it affects user load too much.
|
||||
|
||||
## scrub_list_limit
|
||||
|
||||
- Type: integer
|
||||
- Default: 1000
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
||||
## scrub_find_best
|
||||
|
||||
- Type: boolean
|
||||
- Default: true
|
||||
- Can be changed online: yes
|
||||
|
||||
Find and automatically restore best versions of objects with unmatched
|
||||
copies. In replicated setups, the best version is the version with most
|
||||
matching replicas. In EC setups, the best version is the subset of data
|
||||
and parity chunks without mismatches.
|
||||
|
||||
The hypothetical situation where you might want to disable it is when
|
||||
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
|
||||
corrupt an object in the same way (for example, zero it out) and only
|
||||
1 HDD will remain good. In this case disabling scrub_find_best may help
|
||||
you to recover the data! See also scrub_ec_max_bruteforce below.
|
||||
|
||||
## scrub_ec_max_bruteforce
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
|
||||
chunk by brute-forcing all possible error locations. This configuration
|
||||
value limits the maximum number of checked combinations. You can try to
|
||||
increase it if you have EC N+K setup with N and K large enough for
|
||||
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
|
||||
than the default 100.
|
||||
|
||||
If there are too many possible combinations or if multiple combinations give
|
||||
correct results then objects are marked inconsistent and aren't recovered
|
||||
automatically.
|
||||
|
||||
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
|
||||
the variant with most available equal copies is correct. For example, if
|
||||
you have 3 replicas and 1 of them differs, this one is considered to be
|
||||
corrupted. But if there is no "best" version with more copies than all
|
||||
others have then the object is also marked as inconsistent.
|
||||
|
@@ -8,7 +8,8 @@
|
||||
|
||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||
момент с перезапуском OSD.
|
||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
||||
изменения конфигурации в etcd.
|
||||
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -39,6 +40,14 @@
|
||||
- [throttle_target_parallelism](#throttle_target_parallelism)
|
||||
- [throttle_threshold_us](#throttle_threshold_us)
|
||||
- [osd_memlock](#osd_memlock)
|
||||
- [auto_scrub](#auto_scrub)
|
||||
- [no_scrub](#no_scrub)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [scrub_queue_depth](#scrub_queue_depth)
|
||||
- [scrub_sleep](#scrub_sleep)
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -93,6 +102,7 @@ RUNNING), подходящий под заданную маску. Также н
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
|
||||
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
|
||||
@@ -105,6 +115,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Аналогично autosync_interval, но задаёт не временной интервал, а
|
||||
максимальное количество незафиксированных операций записи перед
|
||||
@@ -114,6 +125,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 4
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число операций восстановления на одном первичном OSD в любой
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
@@ -124,6 +136,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
@@ -135,6 +148,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 16
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число операций восстановления перед дополнительным fsync.
|
||||
|
||||
@@ -150,6 +164,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
|
||||
что эта опция не отключает восстановление объектов, происходящее при
|
||||
@@ -160,6 +175,7 @@ OSD.
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Отключить фоновое перемещение объектов между разными OSD. Отключение
|
||||
означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
|
||||
@@ -169,6 +185,7 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 3
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал, с которым OSD печатают простую человекочитаемую
|
||||
статистику выполнения операций в стандартный вывод.
|
||||
@@ -177,6 +194,7 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 10
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал, с которым OSD выводят в стандартный вывод список
|
||||
медленных или зависших операций, если таковые имеются. Также время, при
|
||||
@@ -186,6 +204,7 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 60
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число секунд, через которое удалённые инод удаляется и из статистики OSD.
|
||||
|
||||
@@ -193,6 +212,7 @@ OSD.
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число одновременных клиентских операций записи на один OSD.
|
||||
Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
|
||||
@@ -202,6 +222,7 @@ OSD.
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Flusher - это микро-поток (корутина), которая копирует данные из журнала в
|
||||
основную область устройства данных. Их число настраивается динамически между
|
||||
@@ -211,6 +232,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 256
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
|
||||
|
||||
@@ -270,6 +292,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
|
||||
гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
|
||||
@@ -288,6 +311,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётное максимальное число ограничиваемых операций в секунду при условии
|
||||
отсутствия свободного места в журнале. Устанавливайте приблизительно равным
|
||||
@@ -298,6 +322,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
@@ -308,6 +333,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётный максимальный параллелизм ограничиваемых операций в секунду при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
@@ -318,6 +344,7 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: микросекунды
|
||||
- Значение по умолчанию: 50
|
||||
- Можно менять на лету: да
|
||||
|
||||
Минимальная применимая к ограничиваемым операциям задержка. Обычно не
|
||||
требует изменений.
|
||||
@@ -327,4 +354,113 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку в пространство подкачки. Требует достаточного значения ulimit -l (лимита заблокированной памяти).
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
|
||||
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
|
||||
заблокированной памяти).
|
||||
|
||||
## auto_scrub
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
|
||||
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
|
||||
запускаются автоматически, так как являются новой функцией. Чтобы включить
|
||||
автоматическое планирование скрабов, установите данный параметр в true.
|
||||
|
||||
Включённый параметр заставляет OSD автоматически планировать фоновую
|
||||
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
|
||||
запустить или запланировать проверку вручную, установив значение ключа JSON
|
||||
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
|
||||
желаемой проверки.
|
||||
|
||||
## no_scrub
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временно отключить и остановить запущенные скрабы.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Тип: строка
|
||||
- Значение по умолчанию: 30d
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
|
||||
Значения без указанной единицы измерения считаются в секундах, допустимые
|
||||
символы единиц измерения в конце: 's' (секунды),
|
||||
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяца) или 'y' (годы).
|
||||
|
||||
## scrub_queue_depth
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число параллельных операций фоновой проверки на один OSD.
|
||||
|
||||
## scrub_sleep
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
|
||||
одном OSD. Может использоваться для замедления скраба, если он слишком
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
|
||||
## scrub_list_limit
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
проверки.
|
||||
|
||||
## scrub_find_best
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: true
|
||||
- Можно менять на лету: да
|
||||
|
||||
Находить и автоматически восстанавливать "лучшие версии" объектов с
|
||||
несовпадающими копиями/частями. При использовании репликации "лучшая"
|
||||
версия - версия, доступная в большем числе экземпляров, чем другие. При
|
||||
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
|
||||
частей данных и чётности, полностью соответствующих друг другу.
|
||||
|
||||
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
|
||||
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
|
||||
незаметно и одинаково повредить данные одного и того же объекта, например,
|
||||
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
|
||||
отключение этого параметра поможет вам восстановить данные! Смотрите также
|
||||
описание следующего параметра - scrub_ec_max_bruteforce.
|
||||
|
||||
## scrub_ec_max_bruteforce
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Vitastor старается определить повреждённые части объектов при использовании
|
||||
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
|
||||
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
|
||||
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
|
||||
его, если используете схему кодирования EC N+K с N и K, достаточно большими
|
||||
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
|
||||
было больше, чем стандартное значение 100.
|
||||
|
||||
Если возможных комбинаций слишком много или если корректная комбинаций не
|
||||
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
|
||||
и не восстанавливаются автоматически.
|
||||
|
||||
При использовании репликации перебор не нужен, Vitastor просто предполагает,
|
||||
что вариант объекта с наибольшим количеством одинаковых копий корректен.
|
||||
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
|
@@ -40,6 +40,7 @@ Parameters:
|
||||
- [root_node](#root_node)
|
||||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -272,6 +273,13 @@ Specifies OSD tags to prefer putting primary OSDs in this pool to.
|
||||
Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one
|
||||
of the OSDs containing a data chunk for a PG.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Type: time interval (number + unit s/m/h/d/M/y)
|
||||
|
||||
Automatic scrubbing interval for this pool. Overrides
|
||||
[global scrub_interval setting](osd.en.md#scrub_interval).
|
||||
|
||||
# Examples
|
||||
|
||||
## Replicated pool
|
||||
|
@@ -39,6 +39,7 @@
|
||||
- [root_node](#root_node)
|
||||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
|
||||
Примеры:
|
||||
|
||||
@@ -276,6 +277,13 @@ PG в Vitastor эферемерны, то есть вы можете менят
|
||||
для PG этого пула. Имейте в виду, что для EC-пулов Vitastor также всегда
|
||||
предпочитает помещать первичный OSD на один из OSD с данными, а не с чётностью.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Тип: временной интервал (число + единица измерения s/m/h/d/M/y)
|
||||
|
||||
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
|
||||
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
|
||||
|
||||
# Примеры
|
||||
|
||||
## Реплицированный пул
|
||||
|
@@ -11,13 +11,21 @@
|
||||
- name: etcd_address
|
||||
type: string or array of strings
|
||||
type_ru: строка или массив строк
|
||||
online: true
|
||||
info: |
|
||||
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
|
||||
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
Note that https is not supported for etcd connections yet.
|
||||
|
||||
etcd connection endpoints can be changed online by updating global
|
||||
configuration in etcd itself - this allows to switch the cluster to new
|
||||
etcd addresses without downtime.
|
||||
info_ru: |
|
||||
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
|
||||
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
|
||||
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
|
||||
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
|
||||
- name: etcd_prefix
|
||||
type: string
|
||||
default: "/vitastor"
|
||||
@@ -31,5 +39,6 @@
|
||||
- name: log_level
|
||||
type: int
|
||||
default: 0
|
||||
online: true
|
||||
info: Log level. Raise if you want more verbose output.
|
||||
info_ru: Уровень логгирования. Повысьте, если хотите более подробный вывод.
|
||||
|
145
docs/config/src/include.js
Executable file
145
docs/config/src/include.js
Executable file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/nodejs
|
||||
|
||||
const fsp = require('fs').promises;
|
||||
|
||||
run(process.argv).catch(console.error);
|
||||
|
||||
async function run(argv)
|
||||
{
|
||||
if (argv.length < 3)
|
||||
{
|
||||
console.log('Markdown preprocessor\nUSAGE: ./include.js file.md');
|
||||
return;
|
||||
}
|
||||
const index_file = await fsp.realpath(argv[2]);
|
||||
const re = /(\{\{[\s\S]*?\}\}|\[[^\]]+\]\([^\)]+\)|(?:^|\n)#[^\n]+)/;
|
||||
let text = await fsp.readFile(index_file, { encoding: 'utf-8' });
|
||||
text = text.split(re);
|
||||
let included = {};
|
||||
let heading = 0, heading_name = '', m;
|
||||
for (let i = 0; i < text.length; i++)
|
||||
{
|
||||
if (text[i].substr(0, 2) == '{{')
|
||||
{
|
||||
// Inclusion
|
||||
let incfile = text[i].substr(2, text[i].length-4);
|
||||
let section = null;
|
||||
let indent = heading;
|
||||
incfile = incfile.replace(/\s*\|\s*indent\s*=\s*(-?\d+)\s*$/, (m, m1) => { indent = parseInt(m1); return ''; });
|
||||
incfile = incfile.replace(/\s*#\s*([^#]+)$/, (m, m1) => { section = m1; return ''; });
|
||||
let inc_heading = section;
|
||||
incfile = rel2abs(index_file, incfile);
|
||||
let inc = await fsp.readFile(incfile, { encoding: 'utf-8' });
|
||||
inc = inc.trim().replace(/^[\s\S]+?\n#/, '#'); // remove until the first header
|
||||
inc = inc.split(re);
|
||||
const indent_str = new Array(indent+1).join('#');
|
||||
let section_start = -1, section_end = -1;
|
||||
for (let j = 0; j < inc.length; j++)
|
||||
{
|
||||
if ((m = /^(\n?)(#+\s*)([\s\S]+)$/.exec(inc[j])))
|
||||
{
|
||||
if (!inc_heading)
|
||||
{
|
||||
inc_heading = m[3].trim();
|
||||
}
|
||||
if (section)
|
||||
{
|
||||
if (m[3].trim() == section)
|
||||
section_start = j;
|
||||
else if (section_start >= 0)
|
||||
{
|
||||
section_end = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
inc[j] = m[1] + indent_str + m[2] + m[3];
|
||||
}
|
||||
else if ((m = /^(\[[^\]]+\]\()([^\)]+)(\))$/.exec(inc[j])) && !/^https?:(\/\/)|^#/.exec(m[2]))
|
||||
{
|
||||
const abs_m2 = rel2abs(incfile, m[2]);
|
||||
const rel_m = abs2rel(__filename, abs_m2);
|
||||
if (rel_m.substr(0, 9) == '../../../') // outside docs
|
||||
inc[j] = m[1] + 'https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/'+rel2abs('docs/config/src/include.js', rel_m) + m[3];
|
||||
else
|
||||
inc[j] = m[1] + abs_m2 + m[3];
|
||||
}
|
||||
}
|
||||
if (section)
|
||||
{
|
||||
inc = section_start >= 0 ? inc.slice(section_start, section_end < 0 ? inc.length : section_end) : [];
|
||||
}
|
||||
if (inc.length)
|
||||
{
|
||||
if (!inc_heading)
|
||||
inc_heading = heading_name||'';
|
||||
included[incfile+(section ? '#'+section : '')] = '#'+inc_heading.toLowerCase().replace(/\P{L}+/ug, '-').replace(/^-|-$/g, '');
|
||||
inc[0] = inc[0].replace(/^\s+/, '');
|
||||
inc[inc.length-1] = inc[inc.length-1].replace(/\s+$/, '');
|
||||
}
|
||||
text.splice(i, 1, ...inc);
|
||||
i = i + inc.length - 1;
|
||||
}
|
||||
else if ((m = /^\n?(#+)\s*([\s\S]+)$/.exec(text[i])))
|
||||
{
|
||||
// Heading
|
||||
heading = m[1].length;
|
||||
heading_name = m[2].trim();
|
||||
}
|
||||
}
|
||||
for (let i = 0; i < text.length; i++)
|
||||
{
|
||||
if ((m = /^(\[[^\]]+\]\()([^\)]+)(\))$/.exec(text[i])) && !/^https?:(\/\/)|^#/.exec(m[2]))
|
||||
{
|
||||
const p = m[2].indexOf('#');
|
||||
if (included[m[2]])
|
||||
{
|
||||
text[i] = m[1]+included[m[2]]+m[3];
|
||||
}
|
||||
else if (p >= 0 && included[m[2].substr(0, p)])
|
||||
{
|
||||
text[i] = m[1]+m[2].substr(p)+m[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
console.log(text.join(''));
|
||||
}
|
||||
|
||||
function rel2abs(ref, rel)
|
||||
{
|
||||
rel = [ ...ref.replace(/^(.*)\/[^\/]+$/, '$1').split(/\/+/), ...rel.split(/\/+/) ];
|
||||
return killdots(rel).join('/');
|
||||
}
|
||||
|
||||
function abs2rel(ref, abs)
|
||||
{
|
||||
ref = ref.split(/\/+/);
|
||||
abs = abs.split(/\/+/);
|
||||
while (ref.length > 1 && ref[0] == abs[0])
|
||||
{
|
||||
ref.shift();
|
||||
abs.shift();
|
||||
}
|
||||
for (let i = 1; i < ref.length; i++)
|
||||
{
|
||||
abs.unshift('..');
|
||||
}
|
||||
return killdots(abs).join('/');
|
||||
}
|
||||
|
||||
function killdots(rel)
|
||||
{
|
||||
for (let i = 0; i < rel.length; i++)
|
||||
{
|
||||
if (rel[i] == '.')
|
||||
{
|
||||
rel.splice(i, 1);
|
||||
i--;
|
||||
}
|
||||
else if (i >= 1 && rel[i] == '..' && rel[i-1] != '..')
|
||||
{
|
||||
rel.splice(i-1, 2);
|
||||
i -= 2;
|
||||
}
|
||||
}
|
||||
return rel;
|
||||
}
|
65
docs/config/src/included.en.md
Normal file
65
docs/config/src/included.en.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# Vitastor
|
||||
|
||||
{{../../../README.md#The Idea}}
|
||||
|
||||
{{../../../README.md#Talks and presentations}}
|
||||
|
||||
{{../../intro/features.en.md}}
|
||||
|
||||
{{../../intro/quickstart.en.md}}
|
||||
|
||||
{{../../intro/architecture.en.md}}
|
||||
|
||||
## Installation
|
||||
|
||||
{{../../installation/packages.en.md}}
|
||||
|
||||
{{../../installation/proxmox.en.md}}
|
||||
|
||||
{{../../installation/openstack.en.md}}
|
||||
|
||||
{{../../installation/kubernetes.en.md}}
|
||||
|
||||
{{../../installation/source.en.md}}
|
||||
|
||||
{{../../config.en.md|indent=1}}
|
||||
|
||||
{{../../config/common.en.md|indent=2}}
|
||||
|
||||
{{../../config/network.en.md|indent=2}}
|
||||
|
||||
{{../../config/layout-cluster.en.md|indent=2}}
|
||||
|
||||
{{../../config/layout-osd.en.md|indent=2}}
|
||||
|
||||
{{../../config/osd.en.md|indent=2}}
|
||||
|
||||
{{../../config/monitor.en.md|indent=2}}
|
||||
|
||||
{{../../config/pool.en.md|indent=2}}
|
||||
|
||||
{{../../config/inode.en.md|indent=2}}
|
||||
|
||||
## Usage
|
||||
|
||||
{{../../usage/cli.en.md}}
|
||||
|
||||
{{../../usage/disk.en.md}}
|
||||
|
||||
{{../../usage/fio.en.md}}
|
||||
|
||||
{{../../usage/nbd.en.md}}
|
||||
|
||||
{{../../usage/qemu.en.md}}
|
||||
|
||||
{{../../usage/nfs.en.md}}
|
||||
|
||||
## Performance
|
||||
|
||||
{{../../performance/understanding.en.md}}
|
||||
|
||||
{{../../performance/theoretical.en.md}}
|
||||
|
||||
{{../../performance/comparison1.en.md}}
|
||||
|
||||
{{../../intro/author.en.md|indent=1}}
|
65
docs/config/src/included.ru.md
Normal file
65
docs/config/src/included.ru.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# Vitastor
|
||||
|
||||
{{../../../README-ru.md#Идея|indent=0}}
|
||||
|
||||
{{../../../README-ru.md#Презентации и записи докладов|indent=0}}
|
||||
|
||||
{{../../intro/features.ru.md}}
|
||||
|
||||
{{../../intro/quickstart.ru.md}}
|
||||
|
||||
{{../../intro/architecture.ru.md}}
|
||||
|
||||
## Установка
|
||||
|
||||
{{../../installation/packages.ru.md}}
|
||||
|
||||
{{../../installation/proxmox.ru.md}}
|
||||
|
||||
{{../../installation/openstack.ru.md}}
|
||||
|
||||
{{../../installation/kubernetes.ru.md}}
|
||||
|
||||
{{../../installation/source.ru.md}}
|
||||
|
||||
{{../../config.ru.md|indent=1}}
|
||||
|
||||
{{../../config/common.ru.md|indent=2}}
|
||||
|
||||
{{../../config/network.ru.md|indent=2}}
|
||||
|
||||
{{../../config/layout-cluster.ru.md|indent=2}}
|
||||
|
||||
{{../../config/layout-osd.ru.md|indent=2}}
|
||||
|
||||
{{../../config/osd.ru.md|indent=2}}
|
||||
|
||||
{{../../config/monitor.ru.md|indent=2}}
|
||||
|
||||
{{../../config/pool.ru.md|indent=2}}
|
||||
|
||||
{{../../config/inode.ru.md|indent=2}}
|
||||
|
||||
## Использование
|
||||
|
||||
{{../../usage/cli.ru.md}}
|
||||
|
||||
{{../../usage/disk.ru.md}}
|
||||
|
||||
{{../../usage/fio.ru.md}}
|
||||
|
||||
{{../../usage/nbd.ru.md}}
|
||||
|
||||
{{../../usage/qemu.ru.md}}
|
||||
|
||||
{{../../usage/nfs.ru.md}}
|
||||
|
||||
## Производительность
|
||||
|
||||
{{../../performance/understanding.ru.md}}
|
||||
|
||||
{{../../performance/theoretical.ru.md}}
|
||||
|
||||
{{../../performance/comparison1.ru.md}}
|
||||
|
||||
{{../../intro/author.ru.md|indent=1}}
|
@@ -14,6 +14,7 @@ const L = {
|
||||
toc_config: '[Configuration](../config.en.md)',
|
||||
toc_usage: 'Usage',
|
||||
toc_performance: 'Performance',
|
||||
online: 'Can be changed online: yes',
|
||||
},
|
||||
ru: {
|
||||
Documentation: 'Документация',
|
||||
@@ -28,6 +29,7 @@ const L = {
|
||||
toc_config: '[Конфигурация](../config.ru.md)',
|
||||
toc_usage: 'Использование',
|
||||
toc_performance: 'Производительность',
|
||||
online: 'Можно менять на лету: да',
|
||||
},
|
||||
};
|
||||
const types = {
|
||||
@@ -70,6 +72,8 @@ for (const file of params_files)
|
||||
out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
|
||||
if (c.min !== undefined)
|
||||
out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
|
||||
if (c.online)
|
||||
out += `- ${L[lang]['online'] || 'Can be changed online: yes'}\n`;
|
||||
out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
|
||||
}
|
||||
const head = fs.readFileSync(__dirname+'/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
|
||||
|
@@ -164,18 +164,21 @@
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: Interval before attempting to reconnect to an unavailable OSD.
|
||||
info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
|
||||
- name: peer_connect_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: Timeout for OSD connection attempts.
|
||||
info_ru: Максимальное время ожидания попытки соединения с OSD.
|
||||
- name: osd_idle_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
OSD connection inactivity time after which clients and other OSDs send
|
||||
keepalive requests to check state of the connection.
|
||||
@@ -186,6 +189,7 @@
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
@@ -198,6 +202,7 @@
|
||||
type: ms
|
||||
min: 50
|
||||
default: 500
|
||||
online: true
|
||||
info: |
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
@@ -211,6 +216,7 @@
|
||||
- name: max_etcd_attempts
|
||||
type: int
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Maximum number of attempts for etcd requests which can't be retried
|
||||
indefinitely.
|
||||
@@ -220,6 +226,7 @@
|
||||
- name: etcd_quick_timeout
|
||||
type: ms
|
||||
default: 1000
|
||||
online: true
|
||||
info: |
|
||||
Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
info_ru: |
|
||||
@@ -228,6 +235,7 @@
|
||||
- name: etcd_slow_timeout
|
||||
type: ms
|
||||
default: 5000
|
||||
online: true
|
||||
info: Timeout for etcd requests which are allowed to wait for some time.
|
||||
info_ru: |
|
||||
Максимальное время выполнения запросов к etcd, для которых не обязательно
|
||||
@@ -235,6 +243,7 @@
|
||||
- name: etcd_keepalive_timeout
|
||||
type: sec
|
||||
default: max(30, etcd_report_interval*2)
|
||||
online: true
|
||||
info: |
|
||||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
@@ -244,6 +253,7 @@
|
||||
- name: etcd_ws_keepalive_timeout
|
||||
type: sec
|
||||
default: 30
|
||||
online: true
|
||||
info: |
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
detect disconnections quickly.
|
||||
@@ -252,6 +262,7 @@
|
||||
- name: client_dirty_limit
|
||||
type: int
|
||||
default: 33554432
|
||||
online: true
|
||||
info: |
|
||||
Without immediate_commit=all this parameter sets the limit of "dirty"
|
||||
(not committed by fsync) data allowed by the client before forcing an
|
||||
|
@@ -1,4 +1,5 @@
|
||||
# Runtime OSD Parameters
|
||||
|
||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||
initialization and can be changed with an OSD restart.
|
||||
initialization and can be changed - either with an OSD restart or, for some of
|
||||
them, even without restarting by updating configuration in etcd.
|
||||
|
@@ -2,4 +2,5 @@
|
||||
|
||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||
момент с перезапуском OSD.
|
||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
||||
изменения конфигурации в etcd.
|
||||
|
@@ -66,6 +66,7 @@
|
||||
- name: autosync_interval
|
||||
type: sec
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||
the immediate_commit mode if disabled. fsyncs are required because without
|
||||
@@ -83,6 +84,7 @@
|
||||
- name: autosync_writes
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||
operations before issuing an fsync operation internally.
|
||||
@@ -93,6 +95,7 @@
|
||||
- name: recovery_queue_depth
|
||||
type: int
|
||||
default: 4
|
||||
online: true
|
||||
info: |
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed or recovery
|
||||
@@ -105,6 +108,7 @@
|
||||
- name: recovery_pg_switch
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
@@ -119,6 +123,7 @@
|
||||
- name: recovery_sync_batch
|
||||
type: int
|
||||
default: 16
|
||||
online: true
|
||||
info: Maximum number of recovery operations before issuing an additional fsync.
|
||||
info_ru: Максимальное число операций восстановления перед дополнительным fsync.
|
||||
- name: readonly
|
||||
@@ -133,6 +138,7 @@
|
||||
- name: no_recovery
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Disable automatic background recovery of objects. Note that it doesn't
|
||||
affect implicit recovery of objects happening during writes - a write is
|
||||
@@ -145,6 +151,7 @@
|
||||
- name: no_rebalance
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Disable background movement of data between different OSDs. Disabling it
|
||||
means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
@@ -155,6 +162,7 @@
|
||||
- name: print_stats_interval
|
||||
type: sec
|
||||
default: 3
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which OSDs print simple human-readable operation
|
||||
statistics on stdout.
|
||||
@@ -164,6 +172,7 @@
|
||||
- name: slow_log_interval
|
||||
type: sec
|
||||
default: 10
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which OSDs dump slow or stuck operations on stdout, if
|
||||
they're any. Also it's the time after which an operation is considered
|
||||
@@ -175,6 +184,7 @@
|
||||
- name: inode_vanish_time
|
||||
type: sec
|
||||
default: 60
|
||||
online: true
|
||||
info: |
|
||||
Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
info_ru: |
|
||||
@@ -182,6 +192,7 @@
|
||||
- name: max_write_iodepth
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Parallel client write operation limit per one OSD. Operations that exceed
|
||||
this limit are pushed to a temporary queue instead of being executed
|
||||
@@ -193,6 +204,7 @@
|
||||
- name: min_flusher_count
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Flusher is a micro-thread that moves data from the journal to the data
|
||||
area of the device. Their number is auto-tuned between minimum and maximum.
|
||||
@@ -204,6 +216,7 @@
|
||||
- name: max_flusher_count
|
||||
type: int
|
||||
default: 256
|
||||
online: true
|
||||
info: |
|
||||
Maximum number of journal flushers (see above min_flusher_count).
|
||||
info_ru: |
|
||||
@@ -284,6 +297,7 @@
|
||||
- name: throttle_small_writes
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
|
||||
with fast journal/metadata devices and slow data devices. The idea is that
|
||||
@@ -312,6 +326,7 @@
|
||||
- name: throttle_target_iops
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Target maximum number of throttled operations per second under the condition
|
||||
of full journal. Set it to approximate random write iops of your data devices
|
||||
@@ -324,6 +339,7 @@
|
||||
- name: throttle_target_mbs
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Target maximum bandwidth in MB/s of throttled operations per second under
|
||||
the condition of full journal. Set it to approximate linear write
|
||||
@@ -336,6 +352,7 @@
|
||||
- name: throttle_target_parallelism
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Target maximum parallelism of throttled operations under the condition of
|
||||
full journal. Set it to approximate internal parallelism of your data
|
||||
@@ -348,6 +365,7 @@
|
||||
- name: throttle_threshold_us
|
||||
type: us
|
||||
default: 50
|
||||
online: true
|
||||
info: |
|
||||
Minimal computed delay to be applied to throttled operations. Usually
|
||||
doesn't need to be changed.
|
||||
@@ -357,10 +375,151 @@
|
||||
- name: osd_memlock
|
||||
type: bool
|
||||
default: false
|
||||
info: >
|
||||
info: |
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with
|
||||
mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
info_ru: >
|
||||
info_ru: |
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
|
||||
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
|
||||
заблокированной памяти).
|
||||
- name: auto_scrub
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Data scrubbing is the process of background verification of copies to find
|
||||
and repair corrupted blocks. It's not run automatically by default since
|
||||
it's a new feature. Set this parameter to true to enable automatic scrubs.
|
||||
|
||||
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
|
||||
every `scrub_interval` (see below). You can also start/schedule scrubbing
|
||||
manually by setting `next_scrub` JSON key to the desired UNIX time of the
|
||||
next scrub in `/pg/history/...` values in etcd.
|
||||
info_ru: |
|
||||
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
|
||||
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
|
||||
запускаются автоматически, так как являются новой функцией. Чтобы включить
|
||||
автоматическое планирование скрабов, установите данный параметр в true.
|
||||
|
||||
Включённый параметр заставляет OSD автоматически планировать фоновую
|
||||
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
|
||||
запустить или запланировать проверку вручную, установив значение ключа JSON
|
||||
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
|
||||
желаемой проверки.
|
||||
- name: no_scrub
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Temporarily disable scrubbing and stop running scrubs.
|
||||
info_ru: |
|
||||
Временно отключить и остановить запущенные скрабы.
|
||||
- name: scrub_interval
|
||||
type: string
|
||||
default: 30d
|
||||
online: true
|
||||
info: |
|
||||
Default automatic scrubbing interval for all pools. Numbers without suffix
|
||||
are treated as seconds, possible unit suffixes include 's' (seconds),
|
||||
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
|
||||
info_ru: |
|
||||
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
|
||||
Значения без указанной единицы измерения считаются в секундах, допустимые
|
||||
символы единиц измерения в конце: 's' (секунды),
|
||||
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяца) или 'y' (годы).
|
||||
- name: scrub_queue_depth
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Number of parallel scrubbing operations per one OSD.
|
||||
info_ru: |
|
||||
Число параллельных операций фоновой проверки на один OSD.
|
||||
- name: scrub_sleep
|
||||
type: ms
|
||||
default: 0
|
||||
online: true
|
||||
info: |
|
||||
Additional interval between two consecutive scrubbing operations on one OSD.
|
||||
Can be used to slow down scrubbing if it affects user load too much.
|
||||
info_ru: |
|
||||
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
|
||||
одном OSD. Может использоваться для замедления скраба, если он слишком
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
- name: scrub_list_limit
|
||||
type: int
|
||||
default: 1000
|
||||
online: true
|
||||
info: |
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
info_ru: |
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
проверки.
|
||||
- name: scrub_find_best
|
||||
type: bool
|
||||
default: true
|
||||
online: true
|
||||
info: |
|
||||
Find and automatically restore best versions of objects with unmatched
|
||||
copies. In replicated setups, the best version is the version with most
|
||||
matching replicas. In EC setups, the best version is the subset of data
|
||||
and parity chunks without mismatches.
|
||||
|
||||
The hypothetical situation where you might want to disable it is when
|
||||
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
|
||||
corrupt an object in the same way (for example, zero it out) and only
|
||||
1 HDD will remain good. In this case disabling scrub_find_best may help
|
||||
you to recover the data! See also scrub_ec_max_bruteforce below.
|
||||
info_ru: |
|
||||
Находить и автоматически восстанавливать "лучшие версии" объектов с
|
||||
несовпадающими копиями/частями. При использовании репликации "лучшая"
|
||||
версия - версия, доступная в большем числе экземпляров, чем другие. При
|
||||
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
|
||||
частей данных и чётности, полностью соответствующих друг другу.
|
||||
|
||||
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
|
||||
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
|
||||
незаметно и одинаково повредить данные одного и того же объекта, например,
|
||||
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
|
||||
отключение этого параметра поможет вам восстановить данные! Смотрите также
|
||||
описание следующего параметра - scrub_ec_max_bruteforce.
|
||||
- name: scrub_ec_max_bruteforce
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
|
||||
chunk by brute-forcing all possible error locations. This configuration
|
||||
value limits the maximum number of checked combinations. You can try to
|
||||
increase it if you have EC N+K setup with N and K large enough for
|
||||
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
|
||||
than the default 100.
|
||||
|
||||
If there are too many possible combinations or if multiple combinations give
|
||||
correct results then objects are marked inconsistent and aren't recovered
|
||||
automatically.
|
||||
|
||||
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
|
||||
the variant with most available equal copies is correct. For example, if
|
||||
you have 3 replicas and 1 of them differs, this one is considered to be
|
||||
corrupted. But if there is no "best" version with more copies than all
|
||||
others have then the object is also marked as inconsistent.
|
||||
info_ru: |
|
||||
Vitastor старается определить повреждённые части объектов при использовании
|
||||
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
|
||||
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
|
||||
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
|
||||
его, если используете схему кодирования EC N+K с N и K, достаточно большими
|
||||
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
|
||||
было больше, чем стандартное значение 100.
|
||||
|
||||
Если возможных комбинаций слишком много или если корректная комбинаций не
|
||||
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
|
||||
и не восстанавливаются автоматически.
|
||||
|
||||
При использовании репликации перебор не нужен, Vitastor просто предполагает,
|
||||
что вариант объекта с наибольшим количеством одинаковых копий корректен.
|
||||
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
|
@@ -8,13 +8,13 @@
|
||||
|
||||
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO, а также блочные RWX, тома.
|
||||
|
||||
Для установки возьмите манифесты из директории [csi/deploy/](../csi/deploy/), поместите
|
||||
вашу конфигурацию подключения к Vitastor в [csi/deploy/001-csi-config-map.yaml](../csi/deploy/001-csi-config-map.yaml),
|
||||
настройте StorageClass в [csi/deploy/009-storage-class.yaml](../csi/deploy/009-storage-class.yaml)
|
||||
Для установки возьмите манифесты из директории [csi/deploy/](../../csi/deploy/), поместите
|
||||
вашу конфигурацию подключения к Vitastor в [csi/deploy/001-csi-config-map.yaml](../../csi/deploy/001-csi-config-map.yaml),
|
||||
настройте StorageClass в [csi/deploy/009-storage-class.yaml](../../csi/deploy/009-storage-class.yaml)
|
||||
и примените все `NNN-*.yaml` к вашей инсталляции Kubernetes.
|
||||
|
||||
```
|
||||
for i in ./???-*.yaml; do kubectl apply -f $i; done
|
||||
```
|
||||
|
||||
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../csi/deploy/example-pvc.yaml).
|
||||
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
|
||||
|
@@ -36,5 +36,5 @@ vitastor_pool_id = 1
|
||||
image_upload_use_cinder_backend = True
|
||||
```
|
||||
|
||||
To put Glance images in Vitastor, use [https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html](volume-backed images),
|
||||
To put Glance images in Vitastor, use [volume-backed images](https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html),
|
||||
although the support has not been verified yet.
|
||||
|
@@ -36,5 +36,5 @@ image_upload_use_cinder_backend = True
|
||||
```
|
||||
|
||||
Чтобы помещать в Vitastor Glance-образы, нужно использовать
|
||||
[https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html](образы на основе томов Cinder),
|
||||
[образы на основе томов Cinder](https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html),
|
||||
однако, поддержка этой функции ещё не проверялась.
|
||||
|
@@ -11,7 +11,8 @@
|
||||
- Trust Vitastor package signing key:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- For Debian 10 (Buster) also enable backports repository:
|
||||
`deb http://deb.debian.org/debian buster-backports main`
|
||||
@@ -45,3 +46,10 @@
|
||||
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
|
||||
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
- node.js 10 or newer
|
||||
|
||||
## Version archive
|
||||
|
||||
All previous Vitastor and other components (QEMU, etcd...) package builds
|
||||
can be found here:
|
||||
|
||||
https://vitastor.io/archive/
|
||||
|
@@ -11,7 +11,8 @@
|
||||
- Добавьте ключ репозитория Vitastor:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- Для Debian 10 (Buster) также включите репозиторий backports:
|
||||
`deb http://deb.debian.org/debian buster-backports main`
|
||||
@@ -44,3 +45,10 @@
|
||||
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
|
||||
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
- node.js 10 или новее
|
||||
|
||||
## Архив предыдущих версий
|
||||
|
||||
Все предыдущие сборки пакетов Vitastor и других компонентов, таких, как QEMU
|
||||
и etcd, можно скачать по следующей ссылке:
|
||||
|
||||
https://vitastor.io/archive/
|
||||
|
@@ -6,10 +6,10 @@
|
||||
|
||||
# Proxmox VE
|
||||
|
||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.4 are supported):
|
||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):
|
||||
|
||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
||||
buster for 6.4, bullseye for 7.4, pve7.1 for 7.1, pve7.2 for 7.2, pve7.3 for 7.3
|
||||
bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
|
||||
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
||||
- Define storage in `/etc/pve/storage.cfg` (see below)
|
||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||
@@ -35,5 +35,5 @@ vitastor: vitastor
|
||||
vitastor_nbd 0
|
||||
```
|
||||
|
||||
\* Note: you can also manually copy [patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) to Proxmox hosts
|
||||
\* Note: you can also manually copy [patches/VitastorPlugin.pm](../../patches/VitastorPlugin.pm) to Proxmox hosts
|
||||
as `/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm` instead of installing pve-storage-vitastor.
|
||||
|
@@ -4,12 +4,12 @@
|
||||
|
||||
[Read in English](proxmox.en.md)
|
||||
|
||||
# Proxmox
|
||||
# Proxmox VE
|
||||
|
||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-7.4):
|
||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):
|
||||
|
||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
||||
buster для 6.4, bullseye для 7.4, pve7.1 для 7.1, pve7.2 для 7.2, pve7.3 для 7.3
|
||||
bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
|
||||
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
||||
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||
@@ -35,5 +35,5 @@ vitastor: vitastor
|
||||
```
|
||||
|
||||
\* Примечание: вместо установки пакета pve-storage-vitastor вы можете вручную скопировать файл
|
||||
[patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) на хосты Proxmox как
|
||||
[patches/VitastorPlugin.pm](../../patches/VitastorPlugin.pm) на хосты Proxmox как
|
||||
`/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm`.
|
||||
|
@@ -44,7 +44,7 @@
|
||||
depends linearly on drive capacity and data store block size which is 128 KB by default.
|
||||
With 128 KB blocks metadata takes around 512 MB per 1 TB (which is still less than Ceph wants).
|
||||
Journal is also kept in memory by default, but in SSD-only clusters it's only 32 MB, and in SSD+HDD
|
||||
clusters, where it's beneficial to increase it, [inmemory_journal](docs/config/osd.en.md#inmemory_journal) can be disabled.
|
||||
clusters, where it's beneficial to increase it, [inmemory_journal](../config/osd.en.md#inmemory_journal) can be disabled.
|
||||
- Vitastor storage layer doesn't have internal copy-on-write or redirect-write. I know that maybe
|
||||
it's possible to create a good copy-on-write storage, but it's much harder and makes performance
|
||||
less deterministic, so CoW isn't used in Vitastor.
|
||||
|
@@ -156,7 +156,7 @@
|
||||
блока хранилища (block_size, по умолчанию 128 КБ). С 128 КБ блоком потребление памяти
|
||||
составляет примерно 512 МБ на 1 ТБ данных. Журналы по умолчанию тоже хранятся в памяти,
|
||||
но в SSD-кластерах нужный размер журнала составляет всего 32 МБ, а в гибридных (SSD+HDD)
|
||||
кластерах, в которых есть смысл делать журналы больше, можно отключить [inmemory_journal](../docs/config/osd.ru.md#inmemory_journal).
|
||||
кластерах, в которых есть смысл делать журналы больше, можно отключить [inmemory_journal](../config/osd.ru.md#inmemory_journal).
|
||||
- В Vitastor нет внутреннего copy-on-write. Я считаю, что реализация CoW-хранилища гораздо сложнее,
|
||||
поэтому сложнее добиться устойчиво хороших результатов. Возможно, в один прекрасный день
|
||||
я придумаю красивый алгоритм для CoW-хранилища, но пока нет — внутреннего CoW в Vitastor не будет.
|
||||
|
@@ -29,12 +29,13 @@
|
||||
- Snapshots and copy-on-write image clones
|
||||
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
||||
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
|
||||
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
|
||||
|
||||
## Plugins and tools
|
||||
|
||||
- [Debian and CentOS packages](../installation/packages.en.md)
|
||||
- [Image management CLI (vitastor-cli)](../usage/cli.en.md)
|
||||
- [Disk management CLI (vitastor-disk)](docs/usage/disk.en.md)
|
||||
- [Disk management CLI (vitastor-disk)](../usage/disk.en.md)
|
||||
- Generic user-space client library
|
||||
- [Native QEMU driver](../usage/qemu.en.md)
|
||||
- [Loadable fio engine for benchmarks](../usage/fio.en.md)
|
||||
@@ -54,7 +55,6 @@ The following features are planned for the future:
|
||||
- iSCSI proxy
|
||||
- Multi-threaded client
|
||||
- Faster failover
|
||||
- Scrubbing without checksums (verification of replicas)
|
||||
- Checksums
|
||||
- Tiered storage (SSD caching)
|
||||
- NVDIMM support
|
||||
|
@@ -13,7 +13,7 @@
|
||||
## Серверные функции
|
||||
|
||||
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
|
||||
- [Производительность](../comparison1.ru.md) ;-D
|
||||
- [Производительность](../performance/comparison1.ru.md) ;-D
|
||||
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
|
||||
Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
|
||||
- Конфигурация через простые человекочитаемые JSON-структуры в etcd
|
||||
@@ -31,12 +31,13 @@
|
||||
- Снапшоты и copy-on-write клоны
|
||||
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
|
||||
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
|
||||
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
|
||||
|
||||
## Драйверы и инструменты
|
||||
|
||||
- [Пакеты для Debian и CentOS](../installation/packages.ru.md)
|
||||
- [Консольный интерфейс управления образами (vitastor-cli)](../usage/cli.ru.md)
|
||||
- [Инструмент управления дисками (vitastor-disk)](docs/usage/disk.ru.md)
|
||||
- [Инструмент управления дисками (vitastor-disk)](../usage/disk.ru.md)
|
||||
- Общая пользовательская клиентская библиотека для работы с кластером
|
||||
- [Драйвер диска для QEMU](../usage/qemu.ru.md)
|
||||
- [Драйвер диска для утилиты тестирования производительности fio](../usage/fio.ru.md)
|
||||
@@ -54,7 +55,6 @@
|
||||
- iSCSI-прокси
|
||||
- Многопоточный клиент
|
||||
- Более быстрое переключение при отказах
|
||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||
- Контрольные суммы
|
||||
- Поддержка SSD-кэширования (tiered storage)
|
||||
- Поддержка NVDIMM
|
||||
|
@@ -20,6 +20,8 @@ It supports the following commands:
|
||||
- [flatten](#flatten)
|
||||
- [rm-data](#rm-data)
|
||||
- [merge-data](#merge-data)
|
||||
- [describe](#describe)
|
||||
- [fix](#fix)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
|
||||
@@ -174,6 +176,51 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
|
||||
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between
|
||||
`<from>` and `<to>`, including `<from>` and `<to>`.
|
||||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
|
||||
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
|
||||
[--min-offset <offset>] [--max-offset <offset>]`
|
||||
|
||||
Describe unclean object locations in the cluster.
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
Only list objects from primary OSD(s) <osds>.
|
||||
--object-state <states>
|
||||
Only list objects in given state(s). State(s) may include:
|
||||
degraded, misplaced, incomplete, corrupted, inconsistent.
|
||||
--pool <pool name or number>
|
||||
Only list objects in the given pool.
|
||||
--inode, --min-inode, --max-inode
|
||||
Restrict listing to specific inode numbers.
|
||||
--min-offset, --max-offset
|
||||
Restrict listing to specific offsets inside inodes.
|
||||
```
|
||||
|
||||
## fix
|
||||
|
||||
`vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]`
|
||||
|
||||
Fix inconsistent objects in the cluster by deleting some copies.
|
||||
|
||||
```
|
||||
--objects <objects>
|
||||
Objects to fix, either in plain text or JSON format. If not specified,
|
||||
object list will be read from STDIN in one of the same formats.
|
||||
Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...
|
||||
JSON format: [{"inode":"0x...","stripe":"0x..."},...]
|
||||
--bad-osds <osds>
|
||||
Remove inconsistent copies/parts of objects from these OSDs, effectively
|
||||
marking them bad and allowing Vitastor to recover objects from other copies.
|
||||
--part <number>
|
||||
Only remove EC part <number> (from 0 to pg_size-1), required for extreme
|
||||
edge cases where one OSD has multiple parts of a EC object.
|
||||
--check no
|
||||
Do not recheck that requested objects are actually inconsistent,
|
||||
delete requested copies/parts anyway.
|
||||
```
|
||||
|
||||
## alloc-osd
|
||||
|
||||
`vitastor-cli alloc-osd`
|
||||
|
@@ -184,6 +184,59 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
в целевой образ `<target>`. `<to>` должен быть дочерним образом `<from>`, а `<target>`
|
||||
должен быть одним из слоёв между `<from>` и `<to>`, включая сами `<from>` и `<to>`.
|
||||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
|
||||
[--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
|
||||
[--min-offset <смещение>] [--max-offset <смещение>]`
|
||||
|
||||
Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
|
||||
или части которых хранятся на наборе OSD, не равном целевому.
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
Перечислять только объекты с первичных OSD из списка <osds>.
|
||||
--object-state <состояния>
|
||||
Перечислять только объекты в указанных состояниях. Возможные состояния
|
||||
объектов:
|
||||
- degraded - деградированная избыточность
|
||||
- misplaced - перемещённый
|
||||
- incomplete - нечитаемый из-за потери большего числа частей, чем допустимо
|
||||
- corrupted - с одной или более повреждённой частью
|
||||
- inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
|
||||
--pool <имя или ID пула>
|
||||
Перечислять только объекты из заданного пула.
|
||||
--inode, --min-inode, --max-inode
|
||||
Перечислять только объекты из указанных номеров инодов (образов).
|
||||
--min-offset, --max-offset
|
||||
Перечислять только объекты с заданных смещений внутри образов.
|
||||
```
|
||||
|
||||
## fix
|
||||
|
||||
`vitastor-cli fix [--objects <объекты>] [--bad-osds <osds>] [--part <номер>] [--check no]`
|
||||
|
||||
Исправить неконсистентные (неоднозначные) объекты путём удаления части копий.
|
||||
|
||||
```
|
||||
--objects <объекты>
|
||||
Объекты для исправления - в простом текстовом или JSON формате. Если опция
|
||||
не указана, список объектов читается со стандартного ввода в тех же форматах.
|
||||
Простой формат: 0x<инод>:0x<смещение> <любой разделитель> 0x<инод>:0x<смещение> ...
|
||||
Формат JSON: [{"inode":"0x<инод>","stripe":"0x<смещение>"},...]
|
||||
--bad-osds <osds>
|
||||
Удалить неконсистентные копии/части объектов с данных OSD, таким образом
|
||||
признавая потерю этих копий и позволяя Vitastor-у восстановить объекты из
|
||||
других копий.
|
||||
--part <номер>
|
||||
Удалить только части EC с заданным номером (от 0 до pg_size-1). Нужно только
|
||||
в редких граничных случаях, когда один и тот же OSD содержит несколько частей
|
||||
одного EC-объекта.
|
||||
--check no
|
||||
Не перепроверять, что заданные объекты действительно в неконсистентном
|
||||
состоянии и просто удалять заданные части.
|
||||
```
|
||||
|
||||
## alloc-osd
|
||||
|
||||
`vitastor-cli alloc-osd`
|
||||
|
@@ -25,6 +25,23 @@ It will output a block device name like /dev/nbd0 which you can then use as a no
|
||||
|
||||
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||
|
||||
Additional options for map command:
|
||||
|
||||
* `--nbd_timeout 30` \
|
||||
Timeout for I/O operations in seconds after exceeding which the kernel stops
|
||||
the device. You can set it to 0 to disable the timeout, but beware that you
|
||||
won't be able to stop the device at all if vitastor-nbd process dies.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
|
||||
note that maximum allowed (nbds_max)*(1+max_part) is 256.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Write log messages to the specified file instead of dropping them (in background mode)
|
||||
or printing them to the standard output (in foreground mode).
|
||||
* `--dev_num N` \
|
||||
Use the specified device /dev/nbdN instead of automatic selection.
|
||||
* `--foreground 1` \
|
||||
Stay in foreground, do not daemonize.
|
||||
|
||||
## Unmap image
|
||||
|
||||
To unmap the device run:
|
||||
@@ -32,3 +49,27 @@ To unmap the device run:
|
||||
```
|
||||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## List mapped images
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
Example output (normal format):
|
||||
|
||||
```
|
||||
/dev/nbd0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/nbd1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Example output (JSON format):
|
||||
|
||||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
@@ -30,6 +30,27 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||
|
||||
Дополнительные опции для команды подключения NBD-устройства:
|
||||
|
||||
* `--nbd_timeout 30` \
|
||||
Максимальное время выполнения любой операции чтения/записи в секундах, при
|
||||
превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
|
||||
в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
|
||||
вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
|
||||
vitastor-nbd.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
|
||||
(`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
|
||||
обычно не должно превышать 256.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
|
||||
при фоновом режиме запуска или печати на стандартный вывод при запуске
|
||||
в консоли с `--foreground 1`.
|
||||
* `--dev_num N` \
|
||||
Использовать заданное устройство `/dev/nbdN` вместо автоматического подбора.
|
||||
* `--foreground 1` \
|
||||
Не уводить процесс в фоновый режим.
|
||||
|
||||
## Отключить устройство
|
||||
|
||||
Для отключения устройства выполните:
|
||||
@@ -37,3 +58,27 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
```
|
||||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## Вывести подключённые устройства
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
Пример вывода в обычном формате:
|
||||
|
||||
```
|
||||
/dev/nbd0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/nbd1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Пример вывода в JSON-формате:
|
||||
|
||||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
@@ -29,7 +29,7 @@ vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
|
||||
--bind <IP> принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
|
||||
--nfspath <PATH> установить путь NFS-экспорта в <PATH> (по умолчанию /)
|
||||
--port <PORT> использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
|
||||
--pool <POOL> исползовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
|
||||
--pool <POOL> использовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
|
||||
--foreground 1 не уходить в фон после запуска
|
||||
```
|
||||
|
||||
|
@@ -50,8 +50,7 @@ async function lp_solve(text)
|
||||
return { score, vars };
|
||||
}
|
||||
|
||||
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, hier_sizes = null,
|
||||
max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
|
||||
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
{
|
||||
if (!pg_count || !osd_tree)
|
||||
{
|
||||
@@ -59,7 +58,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
||||
}
|
||||
const all_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const all_pgs = Object.values(random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout));
|
||||
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
|
||||
const pg_per_osd = {};
|
||||
for (const pg of all_pgs)
|
||||
{
|
||||
@@ -217,45 +216,39 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
|
||||
return move_weights;
|
||||
}
|
||||
|
||||
function build_parent_per_leaf(osd_tree, res = {}, parents = [])
|
||||
{
|
||||
for (const item in osd_tree)
|
||||
{
|
||||
if (osd_tree[item] instanceof Object)
|
||||
build_parent_per_leaf(osd_tree[item], res, [ ...parents, item ]);
|
||||
else
|
||||
res[item] = parents;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
function add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes)
|
||||
function add_valid_previous(osd_tree, prev_weights, all_pgs)
|
||||
{
|
||||
// Add previous combinations that are still valid
|
||||
const parent_per_osd = build_parent_per_leaf(osd_tree);
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const host_per_osd = {};
|
||||
for (const host in osd_tree)
|
||||
{
|
||||
for (const osd in osd_tree[host])
|
||||
{
|
||||
host_per_osd[osd] = host;
|
||||
}
|
||||
}
|
||||
skip_pg: for (const pg_name in prev_weights)
|
||||
{
|
||||
const seen = [];
|
||||
const seen_hosts = {};
|
||||
const pg = pg_name.substr(3).split(/_/);
|
||||
for (const osd of pg)
|
||||
{
|
||||
if (!parent_per_osd[osd])
|
||||
continue skip_pg;
|
||||
for (let i = 0; i < parent_per_osd[osd].length; i++)
|
||||
if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
|
||||
{
|
||||
seen[parent_per_osd[osd][i]]++;
|
||||
if (seen[parent_per_osd[osd][i]] > hier_sizes[i])
|
||||
continue skip_pg;
|
||||
continue skip_pg;
|
||||
}
|
||||
seen_hosts[host_per_osd[osd]] = true;
|
||||
}
|
||||
if (!all_pgs[pg_name])
|
||||
{
|
||||
all_pgs[pg_name] = pg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to minimize data movement
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2,
|
||||
hier_sizes = null, max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
{
|
||||
if (!osd_tree)
|
||||
{
|
||||
@@ -280,10 +273,10 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
||||
}
|
||||
const old_pg_size = prev_int_pgs[0].length;
|
||||
// Get all combinations
|
||||
let all_pgs = random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout);
|
||||
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
|
||||
if (old_pg_size == pg_size)
|
||||
{
|
||||
add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes || [ pg_size, 1 ]);
|
||||
add_valid_previous(osd_tree, prev_weights, all_pgs);
|
||||
}
|
||||
all_pgs = Object.values(all_pgs);
|
||||
const pg_per_osd = {};
|
||||
@@ -509,147 +502,41 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert multi-level tree_node = { level: number|string, id?: string, size?: number, children?: tree_node[] }
|
||||
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
|
||||
// levels = { string: number }
|
||||
// to a multi-level OSD tree suitable for random_hier_combinations()
|
||||
// (or in case of just 2 levels - for all_combinations() / random_combinations())
|
||||
//
|
||||
// Example:
|
||||
// tree_node = { level: 'dc', children: [ { level: 'rack', children: [ { level: 'host', children: [ { level: 'osd', size: 10 } ] } ] } ] }
|
||||
// extract_levels = [ 'rack', 'osd' ]
|
||||
// level_defs = { dc: 1, rack: 2, host: 3, osd: 4 }
|
||||
//
|
||||
// Result:
|
||||
// { rack0: { osd1: 10 } }
|
||||
function extract_tree_levels(tree_node, extract_levels, level_defs, new_tree = { idx: 1, items: {} })
|
||||
// to a two-level osd_tree suitable for all_combinations()
|
||||
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
|
||||
{
|
||||
const next_level = Number(level_defs[extract_levels[0]] || extract_levels[0]) || 0;
|
||||
const level_name = level_defs[extract_levels[0]] ? extract_levels[0] : 'l'+extract_levels[0]+'_';
|
||||
const is_leaf = extract_levels.length == 1;
|
||||
if ((level_defs[tree_node.level] || tree_node.level) >= next_level)
|
||||
osd_level = levels[osd_level] || osd_level;
|
||||
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
|
||||
for (const node of osd_tree)
|
||||
{
|
||||
if (!is_leaf)
|
||||
if ((levels[node.level] || node.level) < failure_domain_level)
|
||||
{
|
||||
// Insert a (possibly fake) level
|
||||
const nt = { idx: 1, items: {} };
|
||||
new_tree.items[level_name+(new_tree.idx++)] = nt.items;
|
||||
extract_tree_levels(tree_node, extract_levels.slice(1), level_defs, nt);
|
||||
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Insert a leaf node
|
||||
const leaf_id = tree_node.id || (level_name+(new_tree.idx++));
|
||||
new_tree.items[leaf_id] = tree_node.size;
|
||||
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (const child_node of tree_node.children||[])
|
||||
{
|
||||
extract_tree_levels(child_node, extract_levels, level_defs, new_tree);
|
||||
}
|
||||
}
|
||||
return new_tree.items;
|
||||
return domains;
|
||||
}
|
||||
|
||||
// generate random PGs with hierarchical failure domains, i.e. for example 3 DC each with 2 HOSTS
|
||||
// osd_tree = { level3_id: { level2_id: { level1_id: scalar_value } }, ... }
|
||||
// osd_tree may contain arbitrary number of levels, but level count must be the same across the whole tree
|
||||
// size_per_level = number of items to select on each level, for example [3, 2, 1].
|
||||
// must have the same number of items as the osd_tree level count.
|
||||
// count = PG count to generate
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
|
||||
// seq_layout = true for the [DC1,DC1,DC2,DC2,DC3,DC3] layout, false for [DC1,DC2,DC3,DC1,DC2,DC3] layout
|
||||
function random_hier_combinations(osd_tree, size_per_level, count, ordered, seq_layout)
|
||||
function extract_osds(osd_tree, levels, osd_level, osds = {})
|
||||
{
|
||||
let seed = 0x5f020e43;
|
||||
const rng = () =>
|
||||
for (const node of osd_tree)
|
||||
{
|
||||
seed ^= seed << 13;
|
||||
seed ^= seed >> 17;
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const get_max_level = (o) =>
|
||||
{
|
||||
let lvl = 0;
|
||||
while (o instanceof Object)
|
||||
if ((levels[node.level] || node.level) >= osd_level)
|
||||
{
|
||||
for (const k in o)
|
||||
{
|
||||
lvl++;
|
||||
o = o[k];
|
||||
break;
|
||||
}
|
||||
osds[node.id] = node.size;
|
||||
}
|
||||
return lvl;
|
||||
};
|
||||
const max_level = get_max_level(osd_tree);
|
||||
const gen_pg = (select) =>
|
||||
{
|
||||
let pg = [ osd_tree ];
|
||||
for (let level = 0; level < max_level; level++)
|
||||
else
|
||||
{
|
||||
let npg = [];
|
||||
for (let i = 0; i < pg.length; i++)
|
||||
{
|
||||
const keys = pg[i] instanceof Object ? Object.keys(pg[i]) : [];
|
||||
const max_keys = keys.length < size_per_level[level] ? keys.length : size_per_level[level];
|
||||
for (let j = 0; j < max_keys; j++)
|
||||
{
|
||||
const r = select(level, i, j, (ordered ? keys.length : (keys.length - (max_keys - j - 1))));
|
||||
const el = pg[i][keys[r]] instanceof Object ? pg[i][keys[r]] : keys[r];
|
||||
npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = el;
|
||||
keys.splice(ordered ? r : 0, ordered ? 1 : (r+1));
|
||||
}
|
||||
for (let j = max_keys; j < size_per_level[level]; j++)
|
||||
npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = NO_OSD;
|
||||
}
|
||||
pg = npg;
|
||||
extract_osds(node.children||[], levels, osd_level, osds);
|
||||
}
|
||||
return pg;
|
||||
};
|
||||
const r = {};
|
||||
// Generate random combinations including each OSD at least once
|
||||
let has_next = true;
|
||||
let ctr = [];
|
||||
while (has_next)
|
||||
{
|
||||
let pg = gen_pg((level, i, j, n) =>
|
||||
{
|
||||
if (i == 0 && j == 0)
|
||||
{
|
||||
// Select a pre-determined OSD in the first position on each level
|
||||
const r = ctr[level] == null || ctr[level][1] != n ? 0 : ctr[level][0];
|
||||
ctr[level] = [ r, n ];
|
||||
return r;
|
||||
}
|
||||
return rng() % n;
|
||||
});
|
||||
for (let i = ctr.length-1; i >= 0; i--)
|
||||
{
|
||||
ctr[i][0]++;
|
||||
if (ctr[i][0] < ctr[i][1])
|
||||
break;
|
||||
else
|
||||
ctr[i] = null;
|
||||
}
|
||||
has_next = ctr[0] != null;
|
||||
const cyclic_pgs = [ pg ];
|
||||
if (ordered)
|
||||
for (let i = 1; i < pg.size; i++)
|
||||
cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
|
||||
for (const pg of cyclic_pgs)
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
}
|
||||
// Generate purely random combinations
|
||||
while (count > 0)
|
||||
{
|
||||
let pg = gen_pg((l, i, j, n) => rng() % n);
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
count--;
|
||||
}
|
||||
return r;
|
||||
return osds;
|
||||
}
|
||||
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
|
||||
@@ -865,12 +752,11 @@ module.exports = {
|
||||
pg_weights_space_efficiency,
|
||||
pg_list_space_efficiency,
|
||||
pg_per_osd_space_efficiency,
|
||||
extract_tree_levels,
|
||||
flatten_tree,
|
||||
|
||||
lp_solve,
|
||||
make_int_pgs,
|
||||
align_pgs,
|
||||
random_combinations,
|
||||
random_hier_combinations,
|
||||
all_combinations,
|
||||
};
|
||||
|
@@ -63,7 +63,7 @@ Wants=network-online.target local-fs.target time-sync.target
|
||||
|
||||
[Service]
|
||||
Restart=always
|
||||
ExecStart=/usr/local/bin/etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
|
||||
ExecStart=etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
|
||||
--advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
||||
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
|
||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
|
||||
|
202
mon/mon.js
202
mon/mon.js
@@ -104,12 +104,21 @@ const etcd_tree = {
|
||||
autosync_writes: 128,
|
||||
client_queue_depth: 128, // unused
|
||||
recovery_queue_depth: 4,
|
||||
recovery_pg_switch: 128,
|
||||
recovery_sync_batch: 16,
|
||||
no_recovery: false,
|
||||
no_rebalance: false,
|
||||
print_stats_interval: 3,
|
||||
slow_log_interval: 10,
|
||||
inode_vanish_time: 60,
|
||||
auto_scrub: false,
|
||||
no_scrub: false,
|
||||
scrub_interval: '30d', // 1s/1m/1h/1d
|
||||
scrub_queue_depth: 1,
|
||||
scrub_sleep: 0, // milliseconds
|
||||
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
||||
scrub_find_best: true,
|
||||
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
@@ -159,10 +168,6 @@ const etcd_tree = {
|
||||
// number of parity chunks, required for EC
|
||||
parity_chunks?: 1,
|
||||
pg_count: 100,
|
||||
// failure_domain = string | { string: int }
|
||||
// the second case specifies multiple failure domains. example:
|
||||
// { datacenter: 3, host: 2 } - means 3 datacenters with 2 hosts each, for EC 4+2
|
||||
// guarantees availability on outage of either 1 datacenter or 2 hosts
|
||||
failure_domain: 'host',
|
||||
max_osd_combinations: 10000,
|
||||
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
|
||||
@@ -176,6 +181,8 @@ const etcd_tree = {
|
||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// prefer to put primary on OSD with these tags
|
||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// scrub interval
|
||||
scrub_interval?: '30d',
|
||||
},
|
||||
...
|
||||
}, */
|
||||
@@ -271,7 +278,7 @@ const etcd_tree = {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
|
||||
}
|
||||
}, */
|
||||
},
|
||||
@@ -293,6 +300,7 @@ const etcd_tree = {
|
||||
osd_sets: osd_num_t[][],
|
||||
all_peers: osd_num_t[],
|
||||
epoch: uint64_t,
|
||||
next_scrub: uint64_t,
|
||||
},
|
||||
}, */
|
||||
},
|
||||
@@ -383,6 +391,7 @@ class Mon
|
||||
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
|
||||
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
|
||||
this.signals_set = false;
|
||||
this.stat_time = Date.now();
|
||||
this.ws = null;
|
||||
this.ws_alive = false;
|
||||
this.ws_keepalive_timer = null;
|
||||
@@ -1031,32 +1040,6 @@ class Mon
|
||||
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
|
||||
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
|
||||
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
|
||||
if (pool_cfg.failure_domain instanceof Object)
|
||||
{
|
||||
for (const key in pool_cfg.failure_domain)
|
||||
{
|
||||
const cnt = parseInt(pool_cfg.failure_domain[key]);
|
||||
if (!cnt || cnt <= 0)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' specifies invalid item count for failure domain \"'+key+'\"');
|
||||
return false;
|
||||
}
|
||||
if (key !== 'host' && key != 'osd' && !(key in this.config.placement_levels||{}))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' uses invalid failure domain \"'+key+'\"');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pool_cfg.failure_domain !== 'host' && pool_cfg.failure_domain != 'osd' &&
|
||||
!(pool_cfg.failure_domain in this.config.placement_levels||{}))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' uses invalid failure domain \"'+pool_cfg.failure_domain+'\"');
|
||||
return false;
|
||||
}
|
||||
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
|
||||
if (!/^[1-9]\d*$/.exec(''+pool_id))
|
||||
{
|
||||
@@ -1142,23 +1125,27 @@ class Mon
|
||||
filter_osds_by_tags(orig_tree, flat_tree, tags)
|
||||
{
|
||||
if (!tags)
|
||||
return 1;
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (const tag of (tags instanceof Array ? tags : [ tags ]))
|
||||
{
|
||||
for (const item in flat_tree)
|
||||
for (const host in flat_tree)
|
||||
{
|
||||
if (flat_tree[item] instanceof Object)
|
||||
let found = 0;
|
||||
for (const osd in flat_tree[host])
|
||||
{
|
||||
if (!filter_osds_by_tags(orig_tree, flat_tree[item], tags))
|
||||
delete flat_tree[item];
|
||||
if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
|
||||
delete flat_tree[host][osd];
|
||||
else
|
||||
found++;
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete flat_tree[host];
|
||||
}
|
||||
else if (!orig_tree[item].tags || !orig_tree[item].tags[tag])
|
||||
delete flat_tree[item];
|
||||
}
|
||||
}
|
||||
for (const item in flat_tree)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
get_affinity_osds(pool_cfg, up_osds, osd_tree)
|
||||
@@ -1217,11 +1204,9 @@ class Mon
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''] || {};
|
||||
const failure_domains = pool_cfg.failure_domain instanceof Object
|
||||
? [ ...Object.keys(pool_cfg.failure_domain), 'osd' ]
|
||||
: [ pool_cfg.failure_domain, 'osd' ];
|
||||
pool_tree = LPOptimizer.extract_tree_levels(pool_tree, failure_domains, levels);
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''];
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
// These are for the purpose of building history.osd_sets
|
||||
const real_prev_pgs = [];
|
||||
@@ -1248,9 +1233,6 @@ class Mon
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
hier_sizes: pool_cfg.failure_domain instanceof Object
|
||||
? [ ...Object.values(pool_cfg.failure_domain), 1 ]
|
||||
: null,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
};
|
||||
@@ -1306,7 +1288,7 @@ class Mon
|
||||
} });
|
||||
}
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length); // FIXME requires hier support too
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
|
||||
this.state.pool.stats[pool_id] = {
|
||||
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
|
||||
total_raw_tb: optimize_result.space,
|
||||
@@ -1429,65 +1411,75 @@ class Mon
|
||||
}
|
||||
}
|
||||
|
||||
derive_osd_stats(st, prev)
|
||||
{
|
||||
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
|
||||
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
||||
if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
|
||||
{
|
||||
return diff;
|
||||
}
|
||||
const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
|
||||
for (const op in st.op_stats||{})
|
||||
{
|
||||
const pr = prev && prev.op_stats && prev.op_stats[op];
|
||||
let c = st.op_stats[op];
|
||||
c = { bytes: BigInt(c.bytes||0), usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
|
||||
const b = c.bytes - BigInt(pr && pr.bytes||0);
|
||||
const us = c.usec - BigInt(pr && pr.usec||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
if (n > 0)
|
||||
diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
|
||||
}
|
||||
for (const op in st.subop_stats||{})
|
||||
{
|
||||
const pr = prev && prev.subop_stats && prev.subop_stats[op];
|
||||
let c = st.subop_stats[op];
|
||||
c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
|
||||
const us = c.usec - BigInt(pr && pr.usec||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
if (n > 0)
|
||||
diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
|
||||
}
|
||||
for (const op in st.recovery_stats||{})
|
||||
{
|
||||
const pr = prev && prev.recovery_stats && prev.recovery_stats[op];
|
||||
let c = st.recovery_stats[op];
|
||||
c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
|
||||
const b = c.bytes - BigInt(pr && pr.bytes||0);
|
||||
const n = c.count - BigInt(pr && pr.count||0);
|
||||
if (n > 0)
|
||||
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
sum_op_stats(timestamp, prev_stats)
|
||||
{
|
||||
const op_stats = {}, subop_stats = {}, recovery_stats = {};
|
||||
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
||||
if (!prev_stats || prev_stats.timestamp >= timestamp)
|
||||
{
|
||||
return sum_diff;
|
||||
}
|
||||
const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
|
||||
// Sum derived values instead of deriving summed
|
||||
for (const osd in this.state.osd.stats)
|
||||
{
|
||||
const st = this.state.osd.stats[osd]||{};
|
||||
for (const op in st.op_stats||{})
|
||||
const derived = this.derive_osd_stats(this.state.osd.stats[osd],
|
||||
this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
|
||||
for (const type in derived)
|
||||
{
|
||||
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
|
||||
op_stats[op].count += BigInt(st.op_stats[op].count||0);
|
||||
op_stats[op].usec += BigInt(st.op_stats[op].usec||0);
|
||||
op_stats[op].bytes += BigInt(st.op_stats[op].bytes||0);
|
||||
}
|
||||
for (const op in st.subop_stats||{})
|
||||
{
|
||||
subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
|
||||
subop_stats[op].count += BigInt(st.subop_stats[op].count||0);
|
||||
subop_stats[op].usec += BigInt(st.subop_stats[op].usec||0);
|
||||
}
|
||||
for (const op in st.recovery_stats||{})
|
||||
{
|
||||
recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
|
||||
recovery_stats[op].count += BigInt(st.recovery_stats[op].count||0);
|
||||
recovery_stats[op].bytes += BigInt(st.recovery_stats[op].bytes||0);
|
||||
for (const op in derived[type])
|
||||
{
|
||||
for (const k in derived[type][op])
|
||||
{
|
||||
sum_diff[type][op] = sum_diff[type][op] || {};
|
||||
sum_diff[type][op][k] = (sum_diff[type][op][k] || 0n) + derived[type][op][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (prev_stats && prev_stats.timestamp >= timestamp)
|
||||
{
|
||||
prev_stats = null;
|
||||
}
|
||||
const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
|
||||
for (const op in op_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
|
||||
{
|
||||
op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
|
||||
op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
|
||||
op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
|
||||
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
|
||||
}
|
||||
}
|
||||
for (const op in subop_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
|
||||
{
|
||||
subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
|
||||
subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
|
||||
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
|
||||
}
|
||||
}
|
||||
for (const op in recovery_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
|
||||
{
|
||||
recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
|
||||
recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
|
||||
}
|
||||
}
|
||||
return { op_stats, subop_stats, recovery_stats };
|
||||
return sum_diff;
|
||||
}
|
||||
|
||||
sum_object_counts()
|
||||
@@ -1646,7 +1638,8 @@ class Mon
|
||||
this.prev_stats ? this.prev_stats.inode_stats : null,
|
||||
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
|
||||
);
|
||||
this.prev_stats = { timestamp, ...stats, inode_stats };
|
||||
this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
|
||||
this.stat_time = Date.now();
|
||||
stats.object_counts = object_counts;
|
||||
stats.object_bytes = object_bytes;
|
||||
stats = this.serialize_bigints(stats);
|
||||
@@ -1762,13 +1755,14 @@ class Mon
|
||||
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
|
||||
{
|
||||
// Recheck OSD tree on OSD addition/deletion
|
||||
const osd_num = key_parts[2];
|
||||
if ((!old) != (!kv.value) || old && kv.value && old.size != kv.value.size)
|
||||
{
|
||||
this.schedule_recheck();
|
||||
}
|
||||
// Recheck PGs <osd_out_time> after last OSD statistics report
|
||||
this.schedule_next_recheck_at(
|
||||
!this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
|
||||
!this.state.osd.stats[osd_num] ? 0 : this.state.osd.stats[osd_num].time+this.config.osd_out_time
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@@ -36,7 +36,7 @@ const crush_tree = [
|
||||
] },
|
||||
];
|
||||
|
||||
const osd_tree = LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {});
|
||||
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
|
||||
console.log(osd_tree);
|
||||
|
||||
async function run()
|
||||
@@ -47,32 +47,32 @@ async function run()
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
console.log('\nAdding 1st failure domain:');
|
||||
cur_tree['l1_1'] = osd_tree['l1_1'];
|
||||
cur_tree['dom1'] = osd_tree['dom1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nAdding 2nd failure domain:');
|
||||
cur_tree['l1_2'] = osd_tree['l1_2'];
|
||||
cur_tree['dom2'] = osd_tree['dom2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nAdding 3rd failure domain:');
|
||||
cur_tree['l1_3'] = osd_tree['l1_3'];
|
||||
cur_tree['dom3'] = osd_tree['dom3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 36 && res.total_space == 36);
|
||||
console.log('\nRemoving 3rd failure domain:');
|
||||
delete cur_tree['l1_3'];
|
||||
delete cur_tree['dom3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nRemoving 2nd failure domain:');
|
||||
delete cur_tree['l1_2'];
|
||||
delete cur_tree['dom2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nRemoving 1st failure domain:');
|
||||
delete cur_tree['l1_1'];
|
||||
delete cur_tree['dom1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
|
@@ -108,11 +108,7 @@ async function run()
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
|
||||
console.log('\n256 PGs, size=3, failure domain=rack');
|
||||
res = await LPOptimizer.optimize_initial({
|
||||
osd_tree: LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {}),
|
||||
pg_size: 3,
|
||||
pg_count: 256,
|
||||
});
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
}
|
||||
|
||||
|
@@ -1,56 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
100: { 110: { 111: 1, 112: 1 }, 120: { 121: 1, 122: 1 } },
|
||||
200: { 210: { 211: 1, 212: 1 }, 220: { 221: 1, 222: 1 } },
|
||||
300: { 310: { 311: 1, 312: 1 }, 320: { 321: 1, 322: 1 } },
|
||||
400: { 410: { 411: 1, 412: 1 }, 420: { 421: 1, 422: 1 } },
|
||||
500: { 510: { 511: 1, 512: 1 }, 520: { 521: 1, 522: 1 } },
|
||||
};
|
||||
|
||||
const osd_tree2 = {
|
||||
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
|
||||
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
|
||||
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
|
||||
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
|
||||
500: { 511: 1, 512: 1, 521: 1, 522: 1 },
|
||||
};
|
||||
|
||||
const osd_tree3 = {
|
||||
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
|
||||
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
|
||||
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
|
||||
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
|
||||
500: { 511: 1 },
|
||||
};
|
||||
|
||||
async function run()
|
||||
{
|
||||
let r;
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree, [ 3, 2, 1 ], 10000, false, true));
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree2, [ 3, 2 ], 0, false, true));
|
||||
// Will contain 'Z':
|
||||
console.log(r = LPOptimizer.random_combinations(osd_tree2, 6, 0, true));
|
||||
console.log(r = LPOptimizer.extract_tree_levels(
|
||||
{ level: 'dc', children: [
|
||||
{ level: 'rack', children: [
|
||||
{ level: 'host', children: [
|
||||
{ level: 'osd', id: 'OSD5', size: 10 },
|
||||
] },
|
||||
] },
|
||||
{ level: 'osd', id: 'OSD10', size: 10 },
|
||||
] },
|
||||
[ 'rack', 'osd' ],
|
||||
{ dc: 1, rack: 2, host: 3, osd: 4 }
|
||||
));
|
||||
if (JSON.stringify(r) != '{"rack1":{"OSD5":10},"rack2":{"OSD10":10}}')
|
||||
throw new Error('extract_tree_levels failed');
|
||||
// should not contain Z:
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree3, [ 3, 2 ], 0, false, true));
|
||||
console.log('OK');
|
||||
}
|
||||
|
||||
run().catch(console.error);
|
@@ -388,8 +388,6 @@ sub unmap_volume
|
||||
my ($class, $storeid, $scfg, $volname, $snapname) = @_;
|
||||
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
|
||||
|
||||
return 1 if !$scfg->{vitastor_nbd};
|
||||
|
||||
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
|
||||
$name .= '@'.$snapname if $snapname;
|
||||
|
||||
@@ -413,7 +411,7 @@ sub activate_volume
|
||||
sub deactivate_volume
|
||||
{
|
||||
my ($class, $storeid, $scfg, $volname, $snapname, $cache) = @_;
|
||||
$class->unmap_volume($storeid, $scfg, $volname, $snapname);
|
||||
$class->unmap_volume($storeid, $scfg, $volname, $snapname) if $scfg->{vitastor_nbd};
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.8.9'
|
||||
VERSION = '0.9.2'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
644
patches/libvirt-9.0-vitastor.diff
Normal file
644
patches/libvirt-9.0-vitastor.diff
Normal file
@@ -0,0 +1,644 @@
|
||||
commit e6f935157944279c2c0634915c3c00feeec748c9
|
||||
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Date: Mon Jun 19 00:58:19 2023 +0300
|
||||
|
||||
Add Vitastor support
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3..5f5daa8 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index 45965fa..b7c23d3 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7103,7 +7103,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -30121,6 +30122,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index 5a9bf20..05058b8 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -494,6 +494,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -550,11 +551,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
"only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
-
|
||||
+ }
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
_("<config> element is currently supported "
|
||||
- "only with 'rbd' disks"));
|
||||
+ "only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index 6cb0a20..8bf7de9 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -1972,6 +1972,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2264,6 +2293,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index f5a9636..8339bc4 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -542,6 +554,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1132,6 +1149,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957..720c07e 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index cecd7e8..d7b79a4 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -87,6 +87,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1286,6 +1287,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index 14a6825..eb4acac 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -128,6 +128,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index e6c187e..035b423 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1433,6 +1433,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1918,6 +1919,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index 8490034..ab2cdaa 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 17ac880..59711b5 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -970,6 +970,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index 6919325..55ffc32 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1445,6 +1445,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index e865aa1..40162af 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -604,6 +604,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -917,6 +949,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -1860,6 +1898,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2242,6 +2281,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index 2eb5653..60ee82d 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -4958,7 +4958,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -10129,6 +10130,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index b841680..a6be771 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -373,6 +373,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -578,6 +579,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index d90c1c9..e853457 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1627,6 +1627,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index e48ae72..2017ccc 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -284,6 +284,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -396,6 +465,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -984,6 +1058,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1162,6 +1284,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index bd6f063..cce34e1 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7338,6 +7338,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af..8bd0a57 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a..852df0d 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index e8e40d6..db55fe5 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index 8a98c6a..4b1bbd4 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1221,6 +1221,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
190
patches/pve-qemu-8.0-vitastor.patch
Normal file
190
patches/pve-qemu-8.0-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 382bec0e7d..af6207dbce 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -114,6 +114,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index c44d05a13f..ebedb42843 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1028,6 +1028,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1882,6 +1902,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -4020,6 +4041,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index fc9447d267..c4ac55c283 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -173,6 +173,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index c05ad0c07e..f5eb701604 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3308,7 +3308,7 @@
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4338,6 +4338,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4787,6 +4809,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5187,6 +5210,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5385,6 +5419,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index 6e8983f39c..1b0b9fcf3e 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -32,7 +32,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 009fab1515..95914e6ebc 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -144,6 +144,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
@@ -392,6 +393,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
@@ -24,4 +24,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.8.9/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.9$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.9.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.2$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.9.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.9
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.9.el7.tar.gz
|
||||
Source0: vitastor-0.9.2.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.9.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.9
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.9.el8.tar.gz
|
||||
Source0: vitastor-0.9.2.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -18,7 +18,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.9.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.9
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.9.el9.tar.gz
|
||||
Source0: vitastor-0.9.2.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -73,7 +73,7 @@ Vitastor library headers for development.
|
||||
Summary: Vitastor - fio drivers
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: fio = 3.27-7.el9
|
||||
Requires: fio = 3.27-8.el9
|
||||
|
||||
|
||||
%description -n vitastor-fio
|
||||
|
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.8.9")
|
||||
add_definitions(-DVERSION="0.9.2")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
vitastor_common
|
||||
@@ -141,6 +141,8 @@ add_library(vitastor_client SHARED
|
||||
cli_common.cpp
|
||||
cli_alloc_osd.cpp
|
||||
cli_status.cpp
|
||||
cli_describe.cpp
|
||||
cli_fix.cpp
|
||||
cli_df.cpp
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
@@ -299,7 +301,7 @@ add_executable(test_cluster_client
|
||||
EXCLUDE_FROM_ALL
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
|
@@ -73,7 +73,10 @@ Input:
|
||||
write request is copied into the metadata area bitwise and stored there.
|
||||
|
||||
Output:
|
||||
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
|
||||
- retval = number of bytes actually read/written or negative error number
|
||||
-EINVAL = invalid input parameters
|
||||
-ENOENT = requested object/version does not exist for reads
|
||||
-ENOSPC = no space left in the store for writes
|
||||
- version = the version actually read or written
|
||||
|
||||
## BS_OP_DELETE
|
||||
@@ -122,11 +125,14 @@ Output:
|
||||
Get a list of all objects in this Blockstore.
|
||||
|
||||
Input:
|
||||
- oid.stripe = PG alignment
|
||||
- len = PG count or 0 to list all objects
|
||||
- offset = PG number
|
||||
- oid.inode = min inode number or 0 to list all inodes
|
||||
- version = max inode number or 0 to list all inodes
|
||||
- pg_alignment = PG alignment
|
||||
- pg_count = PG count or 0 to list all objects
|
||||
- pg_number = PG number
|
||||
- list_stable_limit = max number of clean objects in the reply
|
||||
it's guaranteed that dirty objects are returned from the same interval,
|
||||
i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
|
||||
- min_oid = min inode/stripe or 0 to list all objects
|
||||
- max_oid = max inode/stripe or 0 to list all objects
|
||||
|
||||
Output:
|
||||
- retval = total obj_ver_id count
|
||||
@@ -143,10 +149,27 @@ struct blockstore_op_t
|
||||
uint64_t opcode;
|
||||
// finish callback
|
||||
std::function<void (blockstore_op_t*)> callback;
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
union __attribute__((__packed__))
|
||||
{
|
||||
// R/W
|
||||
struct __attribute__((__packed__))
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
};
|
||||
// List
|
||||
struct __attribute__((__packed__))
|
||||
{
|
||||
object_id min_oid;
|
||||
object_id max_oid;
|
||||
uint32_t pg_alignment;
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_number;
|
||||
uint32_t list_stable_limit;
|
||||
};
|
||||
};
|
||||
void *buf;
|
||||
void *bitmap;
|
||||
int retval;
|
||||
|
@@ -536,14 +536,27 @@ resume_1:
|
||||
return false;
|
||||
}
|
||||
// zero out old metadata entry
|
||||
{
|
||||
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
|
||||
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
|
||||
{
|
||||
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
|
||||
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
|
||||
old_entry->version, cur.oid.inode, cur.oid.stripe);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||
await_sqe(15);
|
||||
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
|
||||
);
|
||||
wait_count++;
|
||||
if (meta_old.sector != meta_new.sector)
|
||||
{
|
||||
await_sqe(15);
|
||||
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
|
||||
);
|
||||
wait_count++;
|
||||
}
|
||||
}
|
||||
if (has_delete)
|
||||
{
|
||||
@@ -662,53 +675,58 @@ resume_1:
|
||||
return false;
|
||||
}
|
||||
flusher->trimming = true;
|
||||
// First update journal "superblock" and only then update <used_start> in memory
|
||||
await_sqe(12);
|
||||
*((journal_entry_start*)flusher->journal_superblock) = {
|
||||
.crc32 = 0,
|
||||
.magic = JOURNAL_MAGIC,
|
||||
.type = JE_START,
|
||||
.size = sizeof(journal_entry_start),
|
||||
.reserved = 0,
|
||||
.journal_start = new_trim_pos,
|
||||
.version = JOURNAL_VERSION,
|
||||
};
|
||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||
wait_count++;
|
||||
resume_13:
|
||||
if (wait_count > 0)
|
||||
// Recheck the position with the "lock" taken
|
||||
new_trim_pos = bs->journal.get_trim_pos();
|
||||
if (new_trim_pos != bs->journal.used_start)
|
||||
{
|
||||
wait_state = 13;
|
||||
return false;
|
||||
}
|
||||
if (!bs->disable_journal_fsync)
|
||||
{
|
||||
await_sqe(20);
|
||||
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
// First update journal "superblock" and only then update <used_start> in memory
|
||||
await_sqe(12);
|
||||
*((journal_entry_start*)flusher->journal_superblock) = {
|
||||
.crc32 = 0,
|
||||
.magic = JOURNAL_MAGIC,
|
||||
.type = JE_START,
|
||||
.size = sizeof(journal_entry_start),
|
||||
.reserved = 0,
|
||||
.journal_start = new_trim_pos,
|
||||
.version = JOURNAL_VERSION,
|
||||
};
|
||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
resume_21:
|
||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||
wait_count++;
|
||||
resume_13:
|
||||
if (wait_count > 0)
|
||||
{
|
||||
wait_state = 21;
|
||||
wait_state = 13;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
bs->journal.used_start = new_trim_pos;
|
||||
if (!bs->disable_journal_fsync)
|
||||
{
|
||||
await_sqe(20);
|
||||
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback_w;
|
||||
resume_21:
|
||||
if (wait_count > 0)
|
||||
{
|
||||
wait_state = 21;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
bs->journal.used_start = new_trim_pos;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
|
||||
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
|
||||
#endif
|
||||
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
||||
{
|
||||
assert(bs->journal.used_start == bs->journal.next_free);
|
||||
printf("Journal flushed\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
flusher->trimming = false;
|
||||
}
|
||||
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
||||
{
|
||||
assert(bs->journal.used_start == bs->journal.next_free);
|
||||
printf("Journal flushed\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
// All done
|
||||
flusher->active_flushers--;
|
||||
|
@@ -462,11 +462,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->offset+1;
|
||||
uint32_t pg_count = op->len;
|
||||
uint64_t pg_stripe_size = op->oid.stripe;
|
||||
uint64_t min_inode = op->oid.inode;
|
||||
uint64_t max_inode = op->version;
|
||||
uint32_t list_pg = op->pg_number+1;
|
||||
uint32_t pg_count = op->pg_count;
|
||||
uint64_t pg_stripe_size = op->pg_alignment;
|
||||
uint64_t min_inode = op->min_oid.inode;
|
||||
uint64_t max_inode = op->max_oid.inode;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||
{
|
||||
@@ -513,7 +513,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
else
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
stable_alloc = op->list_stable_limit;
|
||||
if (stable_alloc > 1024*1024)
|
||||
stable_alloc = 1024*1024;
|
||||
}
|
||||
if (stable_alloc < 32768)
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
@@ -524,22 +530,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
auto max_oid = op->max_oid;
|
||||
bool limited = false;
|
||||
pool_pg_id_t last_shard_id = 0;
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
clean_it = clean_db.lower_bound({
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
});
|
||||
clean_end = clean_db.upper_bound({
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
});
|
||||
clean_it = clean_db.lower_bound(op->min_oid);
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
clean_end = clean_db.upper_bound(max_oid);
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
@@ -558,11 +564,29 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
if (!limited)
|
||||
{
|
||||
limited = true;
|
||||
max_oid = stable[stable_count-1].oid;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
// To maintain the order, we have to include objects in the same range from other shards
|
||||
if (last_shard_id != 0 && last_shard_id != shard_it->first)
|
||||
std::sort(stable, stable+stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
last_shard_id = shard_it->first;
|
||||
}
|
||||
if (first_shard != last_shard)
|
||||
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries
|
||||
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
@@ -571,20 +595,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
obj_ver_id *unstable = NULL;
|
||||
{
|
||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
dirty_it = dirty_db.lower_bound({
|
||||
.oid = {
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
},
|
||||
.oid = op->min_oid,
|
||||
.version = 0,
|
||||
});
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
dirty_end = dirty_db.upper_bound({
|
||||
.oid = {
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
},
|
||||
.oid = max_oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
@@ -628,6 +649,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
stable[stable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
// Stop here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@@ -236,14 +236,6 @@ journal_t::~journal_t()
|
||||
uint64_t journal_t::get_trim_pos()
|
||||
{
|
||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is cleared to its end, restart from the beginning
|
||||
@@ -256,12 +248,26 @@ uint64_t journal_t::get_trim_pos()
|
||||
else
|
||||
{
|
||||
// next_free does not need updating during trim
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
}
|
||||
else if (journal_used_it->first > used_start)
|
||||
{
|
||||
// Journal is cleared up to <journal_used_it>
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
// Can't trim journal
|
||||
|
@@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
|
||||
if (!clean_found && !dirty_found)
|
||||
{
|
||||
// region is not allocated - return zeroes
|
||||
memset(read_op->buf, 0, read_op->len);
|
||||
read_op->version = 0;
|
||||
read_op->retval = read_op->len;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
@@ -140,14 +138,16 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
dirty_entry& dirty = dirty_it->second;
|
||||
bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
|
||||
if (IS_SYNCED(dirty.state))
|
||||
{
|
||||
if (!version_ok && read_op->version != 0)
|
||||
read_op->version = dirty_it->first.version;
|
||||
version_ok = true;
|
||||
}
|
||||
if (version_ok)
|
||||
{
|
||||
if (IS_DELETE(dirty.state))
|
||||
{
|
||||
assert(!result_version);
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (!result_version)
|
||||
{
|
||||
result_version = dirty_it->first.version;
|
||||
@@ -234,12 +234,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (fulfilled < read_op->len)
|
||||
if (!result_version)
|
||||
{
|
||||
// fill remaining parts with zeroes
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
// May happen if there are entries in dirty_db but all of them are !version_ok
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (fulfilled < read_op->len)
|
||||
{
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
assert(fulfilled == read_op->len);
|
||||
}
|
||||
assert(fulfilled == read_op->len);
|
||||
read_op->version = result_version;
|
||||
if (!PRIV(read_op)->pending_ops)
|
||||
{
|
||||
|
@@ -179,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
{
|
||||
object_id oid = dirty_it->first.oid;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
dirty_it = dirty_end;
|
||||
// Unblock operations blocked by delete flushing
|
||||
|
@@ -103,7 +103,7 @@ blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->buf = NULL;
|
||||
sync_op->callback = [this](blockstore_op_t *sync_op)
|
||||
sync_op->callback = [](blockstore_op_t *sync_op)
|
||||
{
|
||||
delete sync_op;
|
||||
};
|
||||
@@ -244,7 +244,7 @@ int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_
|
||||
// Make a wrapped callback
|
||||
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
||||
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
||||
auto cb = [this, op, good_items = good_vers.items,
|
||||
auto cb = [op, good_items = good_vers.items,
|
||||
bad_items = bad_vers.items, split_op_counter,
|
||||
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
||||
{
|
||||
@@ -458,6 +458,16 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
big_to_flush++;
|
||||
}
|
||||
}
|
||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// mark_stable should never be called for in-flight or submitted writes
|
||||
printf(
|
||||
"BUG: Attempt to mark_stable object %lx:%lx v%lu state of which is %x\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
dirty_it->second.state
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
|
||||
IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
|
@@ -6,7 +6,7 @@
|
||||
bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
{
|
||||
// Check or assign version number
|
||||
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
|
||||
bool found = false, deleted = false, unsynced = false, is_del = (op->opcode == BS_OP_DELETE);
|
||||
bool wait_big = false, wait_del = false;
|
||||
void *bmp = NULL;
|
||||
uint64_t version = 1;
|
||||
@@ -26,6 +26,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
found = true;
|
||||
version = dirty_it->first.version + 1;
|
||||
deleted = IS_DELETE(dirty_it->second.state);
|
||||
unsynced = !IS_SYNCED(dirty_it->second.state);
|
||||
wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
|
||||
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
|
||||
? !IS_SYNCED(dirty_it->second.state)
|
||||
@@ -81,10 +82,28 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
wait_del = true;
|
||||
PRIV(op)->real_version = op->version;
|
||||
op->version = version;
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = version-1,
|
||||
}, true);
|
||||
if (unsynced)
|
||||
{
|
||||
// Issue an additional sync so the delete reaches the journal
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->callback = [this, op](blockstore_op_t *sync_op)
|
||||
{
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version-1,
|
||||
}, true);
|
||||
delete sync_op;
|
||||
};
|
||||
enqueue_op(sync_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = version-1,
|
||||
}, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -390,7 +409,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
);
|
||||
#endif
|
||||
// Figure out where data will be
|
||||
journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
|
||||
auto next_next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
|
||||
if (op->len > 0)
|
||||
{
|
||||
auto journal_used_it = journal.used_sectors.lower_bound(next_next_free);
|
||||
if (journal_used_it != journal.used_sectors.end() &&
|
||||
journal_used_it->first < next_next_free + op->len)
|
||||
{
|
||||
printf(
|
||||
"BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!"
|
||||
" Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n",
|
||||
journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
|
||||
journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
journal.next_free = next_next_free;
|
||||
je->oid = op->oid;
|
||||
je->version = op->version;
|
||||
je->offset = op->offset;
|
||||
|
42
src/cli.cpp
42
src/cli.cpp
@@ -73,6 +73,37 @@ static const char* help_text =
|
||||
" <to> must be a child of <from> and <target> may be one of the layers between\n"
|
||||
" <from> and <to>, including <from> and <to>.\n"
|
||||
"\n"
|
||||
"vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
|
||||
" Describe unclean object locations in the cluster.\n"
|
||||
" --osds <osds>\n"
|
||||
" Only list objects from primary OSD(s) <osds>.\n"
|
||||
" --object-state <states>\n"
|
||||
" Only list objects in given state(s). State(s) may include:\n"
|
||||
" degraded, misplaced, incomplete, corrupted, inconsistent.\n"
|
||||
" --pool <pool name or number>\n"
|
||||
" Only list objects in the given pool.\n"
|
||||
" --inode, --min-inode, --max-inode\n"
|
||||
" Restrict listing to specific inode numbers.\n"
|
||||
" --min-offset, --max-offset\n"
|
||||
" Restrict listing to specific offsets inside inodes.\n"
|
||||
"\n"
|
||||
"vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]\n"
|
||||
" Fix inconsistent objects in the cluster by deleting some copies.\n"
|
||||
" --objects <objects>\n"
|
||||
" Objects to fix, either in plain text or JSON format. If not specified,\n"
|
||||
" object list will be read from STDIN in one of the same formats.\n"
|
||||
" Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...\n"
|
||||
" JSON format: [{\"inode\":\"0x...\",\"stripe\":\"0x...\"},...]\n"
|
||||
" --bad-osds <osds>\n"
|
||||
" Remove inconsistent copies/parts of objects from these OSDs, effectively\n"
|
||||
" marking them bad and allowing Vitastor to recover objects from other copies.\n"
|
||||
" --part <number>\n"
|
||||
" Only remove EC part <number> (from 0 to pg_size-1), required for extreme\n"
|
||||
" edge cases where one OSD has multiple parts of a EC object.\n"
|
||||
" --check no\n"
|
||||
" Do not recheck that requested objects are actually inconsistent,\n"
|
||||
" delete requested copies/parts anyway.\n"
|
||||
"\n"
|
||||
"vitastor-cli alloc-osd\n"
|
||||
" Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
|
||||
"\n"
|
||||
@@ -168,6 +199,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
{
|
||||
cli_result_t result = {};
|
||||
p->is_command_line = true;
|
||||
p->parse_config(cfg);
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
cfg.erase("command");
|
||||
@@ -276,6 +308,16 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
}
|
||||
action_cb = p->start_rm(cfg);
|
||||
}
|
||||
else if (cmd[0] == "describe")
|
||||
{
|
||||
// Describe unclean objects
|
||||
action_cb = p->start_describe(cfg);
|
||||
}
|
||||
else if (cmd[0] == "fix")
|
||||
{
|
||||
// Fix inconsistent objects (by deleting some copies)
|
||||
action_cb = p->start_fix(cfg);
|
||||
}
|
||||
else if (cmd[0] == "alloc-osd")
|
||||
{
|
||||
// Allocate a new OSD number
|
||||
|
@@ -34,12 +34,12 @@ public:
|
||||
bool list_first = false;
|
||||
bool json_output = false;
|
||||
int log_level = 0;
|
||||
bool is_command_line = false;
|
||||
bool color = false;
|
||||
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
bool no_recovery = false, no_rebalance = false, readonly = false;
|
||||
|
||||
int waiting = 0;
|
||||
cli_result_t etcd_err;
|
||||
@@ -56,6 +56,8 @@ public:
|
||||
friend struct snap_remover_t;
|
||||
|
||||
std::function<bool(cli_result_t &)> start_status(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_describe(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_fix(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_df(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_create(json11::Json);
|
||||
|
256
src/cli_describe.cpp
Normal file
256
src/cli_describe.cpp
Normal file
@@ -0,0 +1,256 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli_fix.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
std::vector<uint64_t> parse_uint64_list(json11::Json val)
|
||||
{
|
||||
std::vector<uint64_t> ret;
|
||||
if (val.is_number())
|
||||
ret.push_back(val.uint64_value());
|
||||
else if (val.is_string())
|
||||
{
|
||||
const std::string & s = val.string_value();
|
||||
for (int i = 0, p = -1; i <= s.size(); i++)
|
||||
{
|
||||
if (p < 0 && i < s.size() && (isdigit(s[i]) || s[i] == 'x'))
|
||||
p = i;
|
||||
else if (p >= 0 && (i >= s.size() || !isdigit(s[i]) && s[i] != 'x'))
|
||||
{
|
||||
ret.push_back(stoull_full(s.substr(p, i-p), 0));
|
||||
p = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (val.is_array())
|
||||
{
|
||||
for (auto & pg_num: val.array_items())
|
||||
ret.push_back(pg_num.uint64_value());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct cli_describe_t
|
||||
{
|
||||
uint64_t object_state = 0;
|
||||
pool_id_t only_pool = 0;
|
||||
std::vector<uint64_t> only_osds;
|
||||
uint64_t min_inode = 0, max_inode = 0;
|
||||
uint64_t min_offset = 0, max_offset = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
int state = 0;
|
||||
int count = 0;
|
||||
|
||||
json11::Json options;
|
||||
cli_result_t result;
|
||||
json11::Json::array describe_items;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void parse_options(json11::Json cfg)
|
||||
{
|
||||
only_pool = cfg["pool"].uint64_value();
|
||||
if (!only_pool && cfg["pool"].is_string())
|
||||
{
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if (pp.second.name == cfg["pool"].string_value())
|
||||
{
|
||||
only_pool = pp.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
min_inode = cfg["inode"].uint64_value();
|
||||
if (min_inode)
|
||||
{
|
||||
if (!INODE_POOL(min_inode))
|
||||
min_inode |= (uint64_t)only_pool << (64-POOL_ID_BITS);
|
||||
max_inode = min_inode;
|
||||
min_offset = max_offset = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
min_inode = stoull_full(cfg["min_inode"].string_value(), 0); // to support 0x...
|
||||
max_inode = stoull_full(cfg["max_inode"].string_value(), 0);
|
||||
min_offset = stoull_full(cfg["min_offset"].string_value(), 0);
|
||||
max_offset = stoull_full(cfg["max_offset"].string_value(), 0);
|
||||
if (!min_inode && !max_inode && only_pool)
|
||||
{
|
||||
min_inode = (uint64_t)only_pool << (64-POOL_ID_BITS);
|
||||
max_inode = ((uint64_t)only_pool << (64-POOL_ID_BITS)) |
|
||||
(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
}
|
||||
}
|
||||
only_osds = parse_uint64_list(cfg["osds"]);
|
||||
object_state = stoull_full(cfg["object_state"].string_value(), 0);
|
||||
if (!object_state && cfg["object_state"].is_string())
|
||||
{
|
||||
if (cfg["object_state"].string_value().find("inconsistent") != std::string::npos)
|
||||
object_state |= OBJ_INCONSISTENT;
|
||||
if (cfg["object_state"].string_value().find("corrupted") != std::string::npos)
|
||||
object_state |= OBJ_CORRUPTED;
|
||||
if (cfg["object_state"].string_value().find("incomplete") != std::string::npos)
|
||||
object_state |= OBJ_INCOMPLETE;
|
||||
if (cfg["object_state"].string_value().find("degraded") != std::string::npos)
|
||||
object_state |= OBJ_DEGRADED;
|
||||
if (cfg["object_state"].string_value().find("misplaced") != std::string::npos)
|
||||
object_state |= OBJ_MISPLACED;
|
||||
}
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (state == 100)
|
||||
return;
|
||||
parse_options(options);
|
||||
if (min_inode && !INODE_POOL(min_inode))
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!only_osds.size())
|
||||
{
|
||||
uint64_t min_pool = min_inode >> (64-POOL_ID_BITS);
|
||||
uint64_t max_pool = max_inode >> (64-POOL_ID_BITS);
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if (pp.first >= min_pool && (!max_pool || pp.first <= max_pool))
|
||||
{
|
||||
for (auto & pgp: pp.second.pg_config)
|
||||
only_osds.push_back(pgp.second.cur_primary);
|
||||
}
|
||||
}
|
||||
}
|
||||
remove_duplicates(only_osds);
|
||||
parent->cli->init_msgr();
|
||||
if (parent->json_output && parent->is_command_line)
|
||||
{
|
||||
printf("[\n");
|
||||
}
|
||||
for (int i = 0; i < only_osds.size(); i++)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t;
|
||||
op->req = (osd_any_op_t){
|
||||
.describe = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DESCRIBE,
|
||||
},
|
||||
.object_state = object_state,
|
||||
.min_inode = min_inode,
|
||||
.min_offset = min_offset,
|
||||
.max_inode = max_inode,
|
||||
.max_offset = max_offset,
|
||||
},
|
||||
};
|
||||
op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
|
||||
osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
|
||||
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
|
||||
for (int i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
if (!parent->json_output || parent->is_command_line)
|
||||
{
|
||||
#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
|
||||
printf(
|
||||
(parent->json_output
|
||||
? (count > 0 ? ",\n " FMT : " " FMT)
|
||||
: "%lx:%lx part %u on OSD %lu%s%s%s\n"),
|
||||
#undef FMT
|
||||
items[i].inode, items[i].stripe,
|
||||
items[i].role, items[i].osd_num,
|
||||
(items[i].loc_bad & LOC_CORRUPTED ? (parent->json_output ? ",\"corrupted\":true" : " corrupted") : ""),
|
||||
(items[i].loc_bad & LOC_INCONSISTENT ? (parent->json_output ? ",\"inconsistent\":true" : " inconsistent") : ""),
|
||||
(items[i].loc_bad & LOC_OUTDATED ? (parent->json_output ? ",\"outdated\":true" : " outdated") : "")
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto json_item = json11::Json::object {
|
||||
{ "inode", (uint64_t)items[i].inode },
|
||||
{ "stripe", (uint64_t)items[i].stripe },
|
||||
{ "part", (uint64_t)items[i].role },
|
||||
{ "osd_num", (uint64_t)items[i].osd_num },
|
||||
};
|
||||
if (items[i].loc_bad & LOC_CORRUPTED)
|
||||
json_item["corrupted"] = true;
|
||||
if (items[i].loc_bad & LOC_INCONSISTENT)
|
||||
json_item["inconsistent"] = true;
|
||||
if (items[i].loc_bad & LOC_OUTDATED)
|
||||
json_item["outdated"] = true;
|
||||
describe_items.push_back(json_item);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
parent->waiting--;
|
||||
if (!parent->waiting)
|
||||
loop();
|
||||
};
|
||||
parent->waiting++;
|
||||
parent->cli->execute_raw(only_osds[i], op);
|
||||
}
|
||||
resume_1:
|
||||
state = 1;
|
||||
if (parent->waiting > 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (parent->json_output && parent->is_command_line)
|
||||
{
|
||||
printf(count > 0 ? "\n]\n" : "]\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
result.data = describe_items;
|
||||
}
|
||||
state = 100;
|
||||
describe_items.clear();
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_describe(json11::Json cfg)
|
||||
{
|
||||
auto describer = new cli_describe_t();
|
||||
describer->parent = this;
|
||||
describer->options = cfg;
|
||||
return [describer](cli_result_t & result)
|
||||
{
|
||||
describer->loop();
|
||||
if (describer->is_done())
|
||||
{
|
||||
result = describer->result;
|
||||
delete describer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
313
src/cli_fix.cpp
Normal file
313
src/cli_fix.cpp
Normal file
@@ -0,0 +1,313 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli_fix.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
struct cli_fix_t
|
||||
{
|
||||
std::vector<object_id> objects;
|
||||
int part = -1;
|
||||
int processed_count = 0;
|
||||
std::set<osd_num_t> bad_osds;
|
||||
bool no_check = false;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
int state = 0;
|
||||
|
||||
json11::Json options;
|
||||
cli_result_t result;
|
||||
json11::Json::array fix_result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void parse_objects_str(std::string str)
|
||||
{
|
||||
str = trim(str);
|
||||
if (str[0] == '[')
|
||||
{
|
||||
std::string json_err;
|
||||
json11::Json list = json11::Json::parse(str, json_err);
|
||||
if (json_err != "")
|
||||
fprintf(stderr, "Invalid JSON object list input: %s\n", json_err.c_str());
|
||||
else
|
||||
parse_object_list(list);
|
||||
}
|
||||
else
|
||||
{
|
||||
const char *s = str.c_str();
|
||||
char *e = NULL;
|
||||
int len = str.size();
|
||||
object_id oid;
|
||||
for (int p = 0; p < len; p++)
|
||||
{
|
||||
if (isdigit(s[p]))
|
||||
{
|
||||
int p0 = p;
|
||||
oid.inode = strtoull(s+p, &e, 0);
|
||||
p = e-s;
|
||||
while (p < len && !isdigit(s[p]) && s[p] != ':')
|
||||
p++;
|
||||
if (s[p] != ':')
|
||||
{
|
||||
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
|
||||
continue;
|
||||
}
|
||||
p++;
|
||||
while (p < len && !isdigit(s[p]))
|
||||
p++;
|
||||
oid.stripe = strtoull(s+p, &e, 0) & ~STRIPE_MASK;
|
||||
p = e-s;
|
||||
if (oid.inode)
|
||||
objects.push_back(oid);
|
||||
else
|
||||
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void parse_object_list(json11::Json list)
|
||||
{
|
||||
for (auto & obj: list.array_items())
|
||||
{
|
||||
object_id oid = (object_id){
|
||||
.inode = stoull_full(obj["inode"].string_value(), 0),
|
||||
.stripe = stoull_full(obj["stripe"].string_value(), 0) & ~STRIPE_MASK,
|
||||
};
|
||||
if (oid.inode)
|
||||
objects.push_back(oid);
|
||||
else
|
||||
fprintf(stderr, "Invalid JSON object ID in input: %s, bad or missing \"inode\" field\n", obj.dump().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void parse_options(json11::Json cfg)
|
||||
{
|
||||
json11::Json object_list;
|
||||
if (cfg["objects"].is_null())
|
||||
parse_objects_str(read_all_fd(0));
|
||||
else if (cfg["objects"].is_string())
|
||||
parse_objects_str(cfg["objects"].string_value());
|
||||
else
|
||||
parse_object_list(cfg["objects"].array_items());
|
||||
for (auto osd_num: parse_uint64_list(cfg["bad_osds"]))
|
||||
bad_osds.insert(osd_num);
|
||||
no_check = json_is_false(cfg["check"]);
|
||||
if (cfg["part"].is_number() || cfg["part"].is_string())
|
||||
part = cfg["part"].uint64_value();
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (state == 100)
|
||||
return;
|
||||
parse_options(options);
|
||||
if (!objects.size())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Object list is not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!bad_osds.size())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "OSDs are not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
remove_duplicates(objects);
|
||||
parent->cli->init_msgr();
|
||||
resume_1:
|
||||
state = 1;
|
||||
while (processed_count < objects.size())
|
||||
{
|
||||
if (parent->waiting >= parent->iodepth*parent->parallel_osds)
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto & obj = objects[processed_count++];
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
|
||||
continue;
|
||||
}
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
pg_num_t pg_num = (obj.stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||
auto pg_it = pool_cfg.pg_config.find(pg_num);
|
||||
if (pg_it == pool_cfg.pg_config.end() ||
|
||||
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
|
||||
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
|
||||
);
|
||||
continue;
|
||||
}
|
||||
osd_num_t primary_osd = pg_it->second.cur_primary;
|
||||
// Describe -> Remove some copies -> Scrub again
|
||||
osd_op_t *op = new osd_op_t;
|
||||
op->req = (osd_any_op_t){
|
||||
.describe = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DESCRIBE,
|
||||
},
|
||||
.min_inode = obj.inode,
|
||||
.min_offset = obj.stripe,
|
||||
.max_inode = obj.inode,
|
||||
.max_offset = obj.stripe,
|
||||
},
|
||||
};
|
||||
op->callback = [this, primary_osd, &obj](osd_op_t *op)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
|
||||
parent->waiting--;
|
||||
loop();
|
||||
}
|
||||
else
|
||||
{
|
||||
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
|
||||
int *rm_count = (int*)malloc_or_die(sizeof(int));
|
||||
*rm_count = 1; // just in case if anything gets called instantly
|
||||
for (int i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
if (((items[i].loc_bad & LOC_INCONSISTENT) || no_check) &&
|
||||
bad_osds.find(items[i].osd_num) != bad_osds.end() &&
|
||||
(part == -1 || items[i].role == part))
|
||||
{
|
||||
// Remove
|
||||
uint64_t rm_osd_num = items[i].osd_num;
|
||||
osd_op_t *rm_op = new osd_op_t;
|
||||
rm_op->req = (osd_any_op_t){
|
||||
.sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_SEC_DELETE,
|
||||
},
|
||||
.oid = {
|
||||
.inode = op->req.describe.min_inode,
|
||||
.stripe = op->req.describe.min_offset | items[i].role,
|
||||
},
|
||||
.version = 0,
|
||||
},
|
||||
};
|
||||
rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
|
||||
{
|
||||
(*rm_count)--;
|
||||
if (rm_op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
|
||||
rm_osd_num, rm_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (parent->json_output)
|
||||
{
|
||||
fix_result.push_back(json11::Json::object {
|
||||
{ "inode", (uint64_t)rm_op->req.sec_del.oid.inode },
|
||||
{ "stripe", (uint64_t)rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK },
|
||||
{ "part", (uint64_t)rm_op->req.sec_del.oid.stripe & STRIPE_MASK },
|
||||
{ "osd_num", (uint64_t)rm_osd_num },
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Removed %lx:%lx (part %lu) from OSD %lu\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
|
||||
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
|
||||
);
|
||||
}
|
||||
delete rm_op;
|
||||
if (!(*rm_count))
|
||||
{
|
||||
// Scrub
|
||||
free(rm_count);
|
||||
osd_op_t *scrub_op = new osd_op_t;
|
||||
scrub_op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_SCRUB,
|
||||
},
|
||||
.inode = obj.inode,
|
||||
.offset = obj.stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
scrub_op->callback = [this, primary_osd, &obj](osd_op_t *scrub_op)
|
||||
{
|
||||
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
|
||||
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
delete scrub_op;
|
||||
parent->waiting--;
|
||||
loop();
|
||||
};
|
||||
parent->cli->execute_raw(primary_osd, scrub_op);
|
||||
}
|
||||
};
|
||||
(*rm_count)++;
|
||||
parent->cli->execute_raw(rm_osd_num, rm_op);
|
||||
}
|
||||
}
|
||||
(*rm_count)--;
|
||||
if (!*rm_count)
|
||||
{
|
||||
free(rm_count);
|
||||
parent->waiting--;
|
||||
loop();
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
};
|
||||
parent->waiting++;
|
||||
parent->cli->execute_raw(primary_osd, op);
|
||||
}
|
||||
if (parent->waiting > 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (parent->json_output)
|
||||
{
|
||||
result.data = fix_result;
|
||||
}
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_fix(json11::Json cfg)
|
||||
{
|
||||
auto fixer = new cli_fix_t();
|
||||
fixer->parent = this;
|
||||
fixer->options = cfg;
|
||||
return [fixer](cli_result_t & result)
|
||||
{
|
||||
fixer->loop();
|
||||
if (fixer->is_done())
|
||||
{
|
||||
result = fixer->result;
|
||||
delete fixer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
26
src/cli_fix.h
Normal file
26
src/cli_fix.h
Normal file
@@ -0,0 +1,26 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cli.h"
|
||||
#include <algorithm>
|
||||
|
||||
std::vector<uint64_t> parse_uint64_list(json11::Json val);
|
||||
|
||||
template<class T> void remove_duplicates(std::vector<T> & ret)
|
||||
{
|
||||
if (!ret.size())
|
||||
return;
|
||||
std::sort(ret.begin(), ret.end());
|
||||
int j = 0;
|
||||
for (int i = 1; i < ret.size(); i++)
|
||||
{
|
||||
if (ret[i] != ret[j])
|
||||
ret[++j] = ret[i];
|
||||
}
|
||||
ret.resize(j+1);
|
||||
}
|
||||
|
||||
// from http_client.cpp...
|
||||
bool json_is_false(const json11::Json & val);
|
@@ -41,7 +41,7 @@ struct snap_merger_t
|
||||
int fsync_interval = 128;
|
||||
|
||||
// -- STATE --
|
||||
inode_t target;
|
||||
inode_t target, to_num;
|
||||
int target_rank;
|
||||
bool inside_continue = false;
|
||||
int state = 0;
|
||||
@@ -98,6 +98,7 @@ struct snap_merger_t
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
to_num = to_cfg->num;
|
||||
// Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
|
||||
std::vector<inode_t> chain_list;
|
||||
inode_config_t *cur = to_cfg;
|
||||
@@ -451,7 +452,7 @@ struct snap_merger_t
|
||||
{
|
||||
cluster_op_t *op = &rwo->op;
|
||||
op->opcode = OSD_OP_READ;
|
||||
op->inode = target;
|
||||
op->inode = to_num;
|
||||
op->offset = rwo->offset;
|
||||
op->len = target_block_size;
|
||||
op->iov.push_back(rwo->buf, target_block_size);
|
||||
@@ -483,7 +484,7 @@ struct snap_merger_t
|
||||
{
|
||||
// write start->end
|
||||
rwo->todo++;
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas ? 1+rwo->op.version : 0);
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas && to_num == target ? 1+rwo->op.version : 0);
|
||||
rwo->start = rwo->end;
|
||||
if (use_cas)
|
||||
{
|
||||
@@ -502,7 +503,7 @@ struct snap_merger_t
|
||||
{
|
||||
// write start->end
|
||||
rwo->todo++;
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas ? 1+rwo->op.version : 0);
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas && to_num == target ? 1+rwo->op.version : 0);
|
||||
rwo->start = rwo->end;
|
||||
if (use_cas)
|
||||
{
|
||||
@@ -532,7 +533,7 @@ struct snap_merger_t
|
||||
if (use_cas && subop->retval == -EINTR)
|
||||
{
|
||||
// CAS failure - reread and repeat optimistically
|
||||
rwo->start = subop->offset - rwo->offset;
|
||||
rwo->start = rwo->end = 0;
|
||||
rwo_read(rwo);
|
||||
delete subop;
|
||||
return;
|
||||
@@ -542,7 +543,7 @@ struct snap_merger_t
|
||||
rwo->error_read = false;
|
||||
}
|
||||
// Increment CAS version
|
||||
rwo->op.version++;
|
||||
rwo->op.version = subop->version;
|
||||
if (use_cas)
|
||||
next_write(rwo);
|
||||
else
|
||||
|
@@ -65,6 +65,9 @@ struct snap_remover_t
|
||||
int current_child = 0;
|
||||
std::function<bool(cli_result_t &)> cb;
|
||||
|
||||
std::vector<std::string> rebased_images, deleted_images;
|
||||
std::vector<uint64_t> deleted_ids;
|
||||
std::string inverse_child_name, inverse_parent_name;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done()
|
||||
@@ -122,6 +125,7 @@ resume_1:
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
rebased_images.push_back(parent->cli->st_cli.inode_config.at(merge_children[current_child]).name);
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
@@ -134,9 +138,12 @@ resume_2:
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
result.data = my_result(result.data);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
parent->change_parent(merge_children[current_child], new_parent, &result);
|
||||
state = 3;
|
||||
resume_3:
|
||||
@@ -144,6 +151,7 @@ resume_3:
|
||||
return;
|
||||
if (result.err)
|
||||
{
|
||||
result.data = my_result(result.data);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
@@ -165,9 +173,12 @@ resume_4:
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
result.data = my_result(result.data);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
if (state == 100)
|
||||
@@ -181,9 +192,12 @@ resume_5:
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
result.data = my_result(result.data);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
@@ -199,6 +213,12 @@ resume_6:
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
{
|
||||
auto parent_it = parent->cli->st_cli.inode_config.find(chain_list[current_child]);
|
||||
if (parent_it != parent->cli->st_cli.inode_config.end())
|
||||
deleted_images.push_back(parent_it->second.name);
|
||||
deleted_ids.push_back(chain_list[current_child]);
|
||||
}
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb(result))
|
||||
@@ -209,9 +229,12 @@ resume_7:
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
result.data = my_result(result.data);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
@@ -221,11 +244,26 @@ resume_8:
|
||||
return;
|
||||
}
|
||||
state = 100;
|
||||
result = (cli_result_t){
|
||||
.text = "",
|
||||
.data = my_result(result.data),
|
||||
};
|
||||
resume_100:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
json11::Json my_result(json11::Json src)
|
||||
{
|
||||
auto obj = src.object_items();
|
||||
obj["deleted_ids"] = deleted_ids;
|
||||
obj["deleted_images"] = deleted_images;
|
||||
obj["rebased_images"] = rebased_images;
|
||||
obj["renamed_from"] = inverse_parent_name;
|
||||
obj["renamed_to"] = inverse_child_name;
|
||||
return obj;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
@@ -338,7 +376,11 @@ resume_100:
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
if (inode_result["response_range"]["kvs"].array_items().size() == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["response_range"]["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
@@ -377,7 +419,7 @@ resume_100:
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
for (int i = chain_list.size()-1-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
@@ -413,8 +455,8 @@ resume_100:
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
inverse_child_name = child_cfg->name;
|
||||
inverse_parent_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
@@ -425,6 +467,9 @@ resume_100:
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
std::string target_idx_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/index/image/"+inverse_parent_name
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
@@ -449,6 +494,11 @@ resume_100:
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", target_idx_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
@@ -495,12 +545,12 @@ resume_100:
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
}, [this](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
|
||||
result = (cli_result_t){ .err = EIO, .text = "Error renaming "+inverse_parent_name+" to "+inverse_child_name+": "+err };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
@@ -508,14 +558,14 @@ resume_100:
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EAGAIN,
|
||||
.text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children"
|
||||
.text = "Parent ("+inverse_parent_name+"), child ("+inverse_child_name+"), or one of its children"
|
||||
" configuration was modified during rename",
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (parent->progress)
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
printf("Layer %s renamed to %s\n", inverse_parent_name.c_str(), inverse_child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
@@ -28,6 +28,7 @@ struct rm_inode_t
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
std::vector<osd_num_t> inactive_osds;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
@@ -86,6 +87,16 @@ struct rm_inode_t
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inactive_osds = parent->cli->list_inode_get_inactive_osds(lister);
|
||||
if (inactive_osds.size() && !parent->json_output)
|
||||
{
|
||||
fprintf(stderr, "Some data may remain after delete on OSDs which are currently down: ");
|
||||
for (int i = 0; i < inactive_osds.size(); i++)
|
||||
{
|
||||
fprintf(stderr, i > 0 ? ", %lu" : "%lu", inactive_osds[i]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
@@ -167,16 +178,33 @@ struct rm_inode_t
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
fprintf(stderr, "\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
if (parent->progress && total_count > 0)
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n"
|
||||
" Use `vitastor-cli rm-data --pool %u --inode %lu` if you encounter it in listings.\n",
|
||||
pool_id, INODE_NO_POOL(inode), pool_id, INODE_NO_POOL(inode)
|
||||
);
|
||||
}
|
||||
result = (cli_result_t){
|
||||
.err = error_count > 0 ? EIO : 0,
|
||||
.text = error_count > 0 ? "Some blocks were not removed" : (
|
||||
"Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
|
||||
std::to_string(pool_id)+" removed"),
|
||||
.data = json11::Json::object {
|
||||
{ "removed_objects", total_done },
|
||||
{ "total_objects", total_count },
|
||||
{ "inactive_osds", inactive_osds },
|
||||
},
|
||||
};
|
||||
state = 100;
|
||||
}
|
||||
|
@@ -410,14 +410,17 @@ struct rm_osd_t
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||
);
|
||||
auto hist = json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
};
|
||||
if (pg_cfg.next_scrub)
|
||||
hist["next_scrub"] = pg_cfg.next_scrub;
|
||||
history_updates.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", history_key },
|
||||
{ "value", base64_encode(json11::Json(json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
}).dump()) },
|
||||
{ "value", base64_encode(json11::Json(hist).dump()) },
|
||||
} },
|
||||
});
|
||||
history_checks.push_back(json11::Json::object {
|
||||
|
@@ -12,7 +12,7 @@ static const char *obj_states[] = { "clean", "misplaced", "degraded", "incomplet
|
||||
// Print cluster status:
|
||||
// etcd, mon, osd states
|
||||
// raw/used space, object states, pool states, pg states
|
||||
// client io, recovery io, rebalance io
|
||||
// client io, recovery io, rebalance io, scrub io
|
||||
struct status_printer_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
@@ -201,6 +201,7 @@ resume_2:
|
||||
bool readonly = json_is_true(parent->cli->config["readonly"]);
|
||||
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
|
||||
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
|
||||
bool no_scrub = json_is_true(parent->cli->config["no_scrub"]);
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
@@ -219,6 +220,7 @@ resume_2:
|
||||
{ "readonly", readonly },
|
||||
{ "no_recovery", no_recovery },
|
||||
{ "no_rebalance", no_rebalance },
|
||||
{ "no_scrub", no_scrub },
|
||||
{ "pool_count", pool_count },
|
||||
{ "active_pool_count", pools_active },
|
||||
{ "pg_states", pgs_by_state },
|
||||
@@ -250,18 +252,25 @@ resume_2:
|
||||
}
|
||||
more_states.resize(more_states.size()-2);
|
||||
std::string recovery_io;
|
||||
int io_indent = 0;
|
||||
{
|
||||
uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
|
||||
uint64_t deg_iops = agg_stats["recovery_stats"]["degraded"]["iops"].uint64_value();
|
||||
uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
|
||||
uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
|
||||
uint64_t scrub_bps = agg_stats["op_stats"]["scrub"]["bps"].uint64_value();
|
||||
uint64_t scrub_iops = agg_stats["op_stats"]["scrub"]["iops"].uint64_value();
|
||||
if (misp_iops > 0 || misp_bps > 0 || no_rebalance)
|
||||
io_indent = 3;
|
||||
else if (deg_iops > 0 || deg_bps > 0 || no_recovery)
|
||||
io_indent = 2;
|
||||
if (deg_iops > 0 || deg_bps > 0)
|
||||
{
|
||||
recovery_io += " recovery: "+std::string(no_recovery ? "disabled, " : "")+
|
||||
recovery_io += " recovery: "+str_repeat(" ", io_indent-2)+std::string(no_recovery ? "disabled, " : "")+
|
||||
format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
|
||||
}
|
||||
else if (no_recovery)
|
||||
recovery_io += " recovery: disabled\n";
|
||||
recovery_io += " recovery: disabled\n";
|
||||
if (misp_iops > 0 || misp_bps > 0)
|
||||
{
|
||||
recovery_io += " rebalance: "+std::string(no_rebalance ? "disabled, " : "")+
|
||||
@@ -269,6 +278,13 @@ resume_2:
|
||||
}
|
||||
else if (no_rebalance)
|
||||
recovery_io += " rebalance: disabled\n";
|
||||
if (scrub_iops > 0 || scrub_bps > 0)
|
||||
{
|
||||
recovery_io += " scrub: "+str_repeat(" ", io_indent+1)+std::string(no_scrub ? "disabled, " : "")+
|
||||
format_size(scrub_bps)+"/s, "+format_size(scrub_iops, true)+" op/s\n";
|
||||
}
|
||||
else if (no_scrub)
|
||||
recovery_io += " scrub: "+str_repeat(" ", io_indent+1)+"disabled\n";
|
||||
}
|
||||
printf(
|
||||
" cluster:\n"
|
||||
@@ -296,7 +312,7 @@ resume_2:
|
||||
pools_active, pool_count,
|
||||
pgs_by_state_str.c_str(),
|
||||
readonly ? " (read-only mode)" : "",
|
||||
recovery_io.size() > 0 ? " " : "",
|
||||
str_repeat(" ", io_indent).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_write"]["bps"].uint64_value()).c_str(),
|
||||
|
@@ -35,6 +35,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
// peer_osd just connected
|
||||
continue_ops();
|
||||
continue_lists();
|
||||
continue_raw_ops(peer_osd);
|
||||
}
|
||||
else if (dirty_buffers.size())
|
||||
{
|
||||
@@ -104,6 +105,19 @@ cluster_op_t::~cluster_op_t()
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_raw_ops(osd_num_t peer_osd)
|
||||
{
|
||||
auto it = raw_ops.find(peer_osd);
|
||||
while (it != raw_ops.end() && it->first == peer_osd)
|
||||
{
|
||||
auto op = it->second;
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds.at(peer_osd);
|
||||
msgr.outbox_push(op);
|
||||
raw_ops.erase(it++);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::init_msgr()
|
||||
{
|
||||
if (msgr_initialized)
|
||||
@@ -512,6 +526,23 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
|
||||
{
|
||||
auto fd_it = msgr.osd_peer_fds.find(osd_num);
|
||||
if (fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = fd_it->second;
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (msgr.wanted_peers.find(osd_num) == msgr.wanted_peers.end())
|
||||
msgr.connect_peer(osd_num, st_cli.peer_states[osd_num]);
|
||||
raw_ops.emplace(osd_num, op);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
|
||||
{
|
||||
// Save operation for replay when one of PGs goes out of sync
|
||||
@@ -1178,6 +1209,10 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
}
|
||||
else if (op->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
|
@@ -103,6 +103,7 @@ class cluster_client_t
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
std::vector<inode_list_t*> lists;
|
||||
std::multimap<osd_num_t, osd_op_t*> raw_ops;
|
||||
int continuing_ops = 0;
|
||||
bool msgr_initialized = false;
|
||||
|
||||
@@ -118,6 +119,7 @@ public:
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
~cluster_client_t();
|
||||
void execute(cluster_op_t *op);
|
||||
void execute_raw(osd_num_t osd_num, osd_op_t *op);
|
||||
bool is_ready();
|
||||
void on_ready(std::function<void(void)> fn);
|
||||
|
||||
@@ -128,6 +130,7 @@ public:
|
||||
inode_list_t *list_inode_start(inode_t inode,
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
|
||||
int list_pg_count(inode_list_t *lst);
|
||||
const std::vector<osd_num_t> & list_inode_get_inactive_osds(inode_list_t *lst);
|
||||
void list_inode_next(inode_list_t *lst, int next_pgs);
|
||||
//inline uint32_t get_bs_bitmap_granularity() { return st_cli.global_bitmap_granularity; }
|
||||
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
|
||||
@@ -153,4 +156,5 @@ protected:
|
||||
void continue_lists();
|
||||
void continue_listing(inode_list_t *lst);
|
||||
void send_list(inode_list_osd_t *cur_list);
|
||||
void continue_raw_ops(osd_num_t peer_osd);
|
||||
};
|
||||
|
@@ -36,6 +36,7 @@ struct inode_list_t
|
||||
inode_t inode = 0;
|
||||
int done_pgs = 0;
|
||||
int want = 0;
|
||||
std::vector<osd_num_t> inactive_osds;
|
||||
std::vector<inode_list_pg_t*> pgs;
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback;
|
||||
};
|
||||
@@ -60,6 +61,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||
lst->inode = inode;
|
||||
lst->callback = callback;
|
||||
auto pool_cfg = st_cli.pool_config[pool_id];
|
||||
std::set<osd_num_t> inactive_osd_set;
|
||||
for (auto & pg_item: pool_cfg.pg_config)
|
||||
{
|
||||
auto & pg = pg_item.second;
|
||||
@@ -106,11 +108,18 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||
}
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
r->list_osds.push_back((inode_list_osd_t){
|
||||
.pg = r,
|
||||
.osd_num = peer_osd,
|
||||
.sent = false,
|
||||
});
|
||||
if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
|
||||
{
|
||||
r->list_osds.push_back((inode_list_osd_t){
|
||||
.pg = r,
|
||||
.osd_num = peer_osd,
|
||||
.sent = false,
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
inactive_osd_set.insert(peer_osd);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -132,6 +141,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||
{
|
||||
lst->pgs[i]->pos = i;
|
||||
}
|
||||
lst->inactive_osds.insert(lst->inactive_osds.end(), inactive_osd_set.begin(), inactive_osd_set.end());
|
||||
lists.push_back(lst);
|
||||
return lst;
|
||||
}
|
||||
@@ -141,6 +151,11 @@ int cluster_client_t::list_pg_count(inode_list_t *lst)
|
||||
return lst->pgs.size();
|
||||
}
|
||||
|
||||
const std::vector<osd_num_t> & cluster_client_t::list_inode_get_inactive_osds(inode_list_t *lst)
|
||||
{
|
||||
return lst->inactive_osds;
|
||||
}
|
||||
|
||||
void cluster_client_t::list_inode_next(inode_list_t *lst, int next_pgs)
|
||||
{
|
||||
if (next_pgs >= 0)
|
||||
|
@@ -55,23 +55,6 @@ std::string realpath_str(std::string path, bool nofail)
|
||||
return rp;
|
||||
}
|
||||
|
||||
std::string read_all_fd(int fd)
|
||||
{
|
||||
int res_size = 0;
|
||||
std::string res;
|
||||
while (1)
|
||||
{
|
||||
res.resize(res_size+1024);
|
||||
int r = read(fd, (char*)res.data()+res_size, res.size()-res_size);
|
||||
if (r > 0)
|
||||
res_size += r;
|
||||
else if (!r || errno != EAGAIN && errno != EINTR)
|
||||
break;
|
||||
}
|
||||
res.resize(res_size);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string read_file(std::string file, bool allow_enoent)
|
||||
{
|
||||
std::string res;
|
||||
|
@@ -7,8 +7,8 @@
|
||||
#ifndef __MOCK__
|
||||
#include "addr_util.h"
|
||||
#include "http_client.h"
|
||||
#include "str_util.h"
|
||||
#endif
|
||||
#include "str_util.h"
|
||||
|
||||
etcd_state_client_t::~etcd_state_client_t()
|
||||
{
|
||||
@@ -777,6 +777,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
|
||||
continue;
|
||||
}
|
||||
// Scrub Interval
|
||||
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
|
||||
if (!pc.scrub_interval)
|
||||
pc.scrub_interval = 0;
|
||||
// Immediate Commit Mode
|
||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||
? (pool_item.second["immediate_commit"].string_value() == "all"
|
||||
@@ -919,6 +923,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
// Read epoch
|
||||
pg_cfg.epoch = value["epoch"].uint64_value();
|
||||
// Next scrub timestamp (0 or empty = scrub is not needed)
|
||||
pg_cfg.next_scrub = value["next_scrub"].uint64_value();
|
||||
if (on_change_pg_history_hook != NULL)
|
||||
{
|
||||
on_change_pg_history_hook(pool_id, pg_num);
|
||||
|
@@ -39,6 +39,7 @@ struct pg_config_t
|
||||
osd_num_t cur_primary;
|
||||
int cur_state;
|
||||
uint64_t epoch;
|
||||
uint64_t next_scrub;
|
||||
};
|
||||
|
||||
struct pool_config_t
|
||||
@@ -55,6 +56,7 @@ struct pool_config_t
|
||||
uint64_t max_osd_combinations;
|
||||
uint64_t pg_stripe_size;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
uint64_t scrub_interval;
|
||||
};
|
||||
|
||||
struct inode_config_t
|
||||
|
@@ -251,6 +251,10 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
return;
|
||||
}
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
||||
}
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
@@ -313,7 +317,10 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
// Stop client
|
||||
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
||||
}
|
||||
stop_client(peer_fd, true);
|
||||
}
|
||||
else if (epoll_events & EPOLLIN)
|
||||
|
@@ -50,7 +50,7 @@ struct osd_client_t
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
int peer_port;
|
||||
int peer_fd;
|
||||
int peer_fd = -1;
|
||||
int peer_state;
|
||||
int connect_timeout_id = -1;
|
||||
int ping_time_remaining = 0;
|
||||
@@ -87,11 +87,7 @@ struct osd_client_t
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t()
|
||||
{
|
||||
free(in_buf);
|
||||
in_buf = NULL;
|
||||
}
|
||||
~osd_client_t();
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
@@ -136,6 +132,7 @@ protected:
|
||||
uint64_t rdma_max_msg = 0;
|
||||
#endif
|
||||
|
||||
bool has_send_loop = false;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
@@ -163,6 +160,7 @@ public:
|
||||
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
bool can_send();
|
||||
void accept_connections(int listen_fd);
|
||||
~osd_messenger_t();
|
||||
|
||||
@@ -177,6 +175,8 @@ public:
|
||||
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
|
||||
#endif
|
||||
|
||||
void measure_exec(osd_op_t *cur_op);
|
||||
|
||||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
@@ -188,7 +188,6 @@ protected:
|
||||
void cancel_op(osd_op_t *op);
|
||||
|
||||
bool try_send(osd_client_t *cl);
|
||||
void measure_exec(osd_op_t *cur_op);
|
||||
void handle_send(int result, osd_client_t *cl);
|
||||
|
||||
bool handle_read(int result, osd_client_t *cl);
|
||||
|
@@ -9,6 +9,10 @@ osd_op_t::~osd_op_t()
|
||||
{
|
||||
assert(!bs_op);
|
||||
assert(!op_data);
|
||||
if (bitmap_buf)
|
||||
{
|
||||
free(bitmap_buf);
|
||||
}
|
||||
if (rmw_buf)
|
||||
{
|
||||
free(rmw_buf);
|
||||
|
@@ -165,6 +165,7 @@ struct osd_op_t
|
||||
void *bitmap = NULL;
|
||||
unsigned bitmap_len = 0;
|
||||
unsigned bmp_data = 0;
|
||||
void *bitmap_buf = NULL;
|
||||
void *rmw_buf = NULL;
|
||||
osd_primary_op_data_t* op_data = NULL;
|
||||
std::function<void(osd_op_t*)> callback;
|
||||
|
@@ -251,10 +251,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
}
|
||||
cl->read_remaining = cur_op->req.sec_read_bmp.len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
||||
{
|
||||
if (cur_op->req.rw.len > 0)
|
||||
@@ -274,6 +270,12 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
}
|
||||
cl->read_remaining = cur_op->req.show_conf.json_len;
|
||||
}
|
||||
/*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}*/
|
||||
if (cl->read_remaining > 0)
|
||||
{
|
||||
// Read data
|
||||
@@ -367,6 +369,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
op->buf = malloc_or_die(op->reply.hdr.retval);
|
||||
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
||||
}
|
||||
else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.describe.result_bytes > 0)
|
||||
{
|
||||
delete cl->read_op;
|
||||
cl->read_op = op;
|
||||
cl->read_state = CL_READ_REPLY_DATA;
|
||||
cl->read_remaining = op->reply.describe.result_bytes;
|
||||
free(op->buf);
|
||||
op->buf = malloc_or_die(op->reply.describe.result_bytes);
|
||||
cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
reuse:
|
||||
|
@@ -73,7 +73,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
? (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
: (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
||||
@@ -83,9 +84,12 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
{
|
||||
for (int i = 0; i < cur_op->iov.count; i++)
|
||||
{
|
||||
assert(cur_op->iov.buf[i].iov_base);
|
||||
to_send_list.push_back(cur_op->iov.buf[i]);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
if (cur_op->iov.buf[i].iov_len > 0)
|
||||
{
|
||||
assert(cur_op->iov.buf[i].iov_base);
|
||||
to_send_list.push_back(cur_op->iov.buf[i]);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
@@ -107,22 +111,28 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (!ringloop)
|
||||
if (!ringloop && !has_send_loop)
|
||||
{
|
||||
// FIXME: It's worse because it doesn't allow batching
|
||||
// "Send loop" should be used, like in QEMU driver, or performance will suffer
|
||||
// due to a large number of sendmsg() syscalls where they can be avoided.
|
||||
// "Send loop" increases 4k T1Q128 randread from ~55k iops to ~90k on a 1 OSD.
|
||||
// The difference is smaller when there are more OSDs though...
|
||||
while (cl->outbox.size())
|
||||
{
|
||||
try_send(cl);
|
||||
}
|
||||
}
|
||||
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
|
||||
else if (cl->write_msg.msg_iovlen > 0 || has_send_loop || !try_send(cl))
|
||||
{
|
||||
if (cl->write_state == 0)
|
||||
{
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
if (ringloop)
|
||||
{
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,8 +159,10 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
|
||||
(cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
||||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
// req.rw.len is internally set to the full object size for scrubs
|
||||
stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.rw.len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
@@ -212,6 +224,12 @@ void osd_messenger_t::send_replies()
|
||||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
bool osd_messenger_t::can_send()
|
||||
{
|
||||
has_send_loop = true;
|
||||
return write_ready_clients.size() > 0;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
cl->write_msg.msg_iovlen = 0;
|
||||
|
@@ -122,17 +122,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// And close the FD only when everything is done
|
||||
// ...because peer_fd number can get reused after close()
|
||||
close(peer_fd);
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->rdma_conn)
|
||||
{
|
||||
delete cl->rdma_conn;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
// Find the item again because it can be invalidated at this point
|
||||
it = clients.find(peer_fd);
|
||||
if (it != clients.end())
|
||||
@@ -145,3 +134,25 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
delete cl;
|
||||
}
|
||||
}
|
||||
|
||||
osd_client_t::~osd_client_t()
|
||||
{
|
||||
free(in_buf);
|
||||
in_buf = NULL;
|
||||
if (peer_fd >= 0)
|
||||
{
|
||||
// Close the FD only when the client is actually destroyed
|
||||
// Which only happens when all references are cleared
|
||||
close(peer_fd);
|
||||
peer_fd = -1;
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_conn)
|
||||
{
|
||||
delete rdma_conn;
|
||||
rdma_conn = NULL;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@@ -137,12 +137,19 @@ public:
|
||||
"OPTIONS:\n"
|
||||
" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
|
||||
" --nbd_timeout 30\n"
|
||||
" timeout in seconds after which the kernel will stop the device\n"
|
||||
" you can set it to 0, but beware that you won't be able to stop the device at all\n"
|
||||
" if vitastor-nbd process dies\n"
|
||||
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
|
||||
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
|
||||
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
|
||||
" --nbd_max_devices 64 --nbd_max_part 3\n"
|
||||
" options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n",
|
||||
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
|
||||
" --logfile /path/to/log/file.txt\n"
|
||||
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" or printing them to the standard output (in foreground mode).\n"
|
||||
" --dev_num N\n"
|
||||
" Use the specified device /dev/nbdN instead of automatic selection.\n"
|
||||
" --foreground 1\n"
|
||||
" Stay in foreground, do not daemonize.\n",
|
||||
exe_name, exe_name, exe_name
|
||||
);
|
||||
exit(0);
|
||||
|
56
src/osd.cpp
56
src/osd.cpp
@@ -13,6 +13,7 @@
|
||||
#include "osd_primary.h"
|
||||
#include "osd.h"
|
||||
#include "http_client.h"
|
||||
#include "str_util.h"
|
||||
|
||||
static blockstore_config_t json_to_bs(const json11::Json::object & config)
|
||||
{
|
||||
@@ -168,6 +169,8 @@ void osd_t::parse_config(bool init)
|
||||
no_rebalance = json_is_true(config["no_rebalance"]);
|
||||
auto old_no_recovery = no_recovery;
|
||||
no_recovery = json_is_true(config["no_recovery"]);
|
||||
auto old_no_scrub = no_scrub;
|
||||
no_scrub = json_is_true(config["no_scrub"]);
|
||||
auto old_autosync_interval = autosync_interval;
|
||||
if (!config["autosync_interval"].is_null())
|
||||
{
|
||||
@@ -207,6 +210,38 @@ void osd_t::parse_config(bool init)
|
||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||
if (!inode_vanish_time)
|
||||
inode_vanish_time = 60;
|
||||
auto old_auto_scrub = auto_scrub;
|
||||
auto_scrub = json_is_true(config["auto_scrub"]);
|
||||
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
|
||||
if (!global_scrub_interval)
|
||||
global_scrub_interval = 30*86400;
|
||||
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
|
||||
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
scrub_queue_depth = 1;
|
||||
scrub_find_best = !json_is_false(config["scrub_find_best"]);
|
||||
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
|
||||
if (scrub_ec_max_bruteforce < 1)
|
||||
scrub_ec_max_bruteforce = 100;
|
||||
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||
if (!scrub_list_limit)
|
||||
scrub_list_limit = 1000;
|
||||
if (!old_auto_scrub && auto_scrub)
|
||||
{
|
||||
// Schedule scrubbing
|
||||
for (auto & pgp: pgs)
|
||||
{
|
||||
plan_scrub(pgp.second);
|
||||
}
|
||||
}
|
||||
if (old_no_scrub && !no_scrub)
|
||||
{
|
||||
// Wakeup scrubbing
|
||||
for (auto & pgp: pgs)
|
||||
{
|
||||
schedule_scrub(pgp.second);
|
||||
}
|
||||
}
|
||||
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
|
||||
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||
{
|
||||
@@ -337,6 +372,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||
{
|
||||
// Readonly mode
|
||||
@@ -367,6 +404,14 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
{
|
||||
continue_primary_del(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
continue_primary_describe(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
exec_secondary(cur_op);
|
||||
@@ -431,6 +476,10 @@ void osd_t::print_stats()
|
||||
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
|
||||
}
|
||||
}
|
||||
if (corrupted_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
|
||||
}
|
||||
if (incomplete_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
||||
@@ -498,10 +547,11 @@ void osd_t::print_slow()
|
||||
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||
{
|
||||
bufprintf(
|
||||
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
|
||||
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
||||
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
||||
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
||||
op->req.sec_list.pg_stripe_size
|
||||
op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
|
||||
);
|
||||
}
|
||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
|
41
src/osd.h
41
src/osd.h
@@ -28,6 +28,7 @@
|
||||
#define OSD_PEERING_PGS 0x04
|
||||
#define OSD_FLUSHING_PGS 0x08
|
||||
#define OSD_RECOVERING 0x10
|
||||
#define OSD_SCRUBBING 0x20
|
||||
|
||||
#define MAX_AUTOSYNC_INTERVAL 3600
|
||||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||
@@ -98,6 +99,7 @@ class osd_t
|
||||
bool run_primary = false;
|
||||
bool no_rebalance = false;
|
||||
bool no_recovery = false;
|
||||
bool no_scrub = false;
|
||||
std::string bind_address;
|
||||
int bind_port, listen_backlog = 128;
|
||||
// FIXME: Implement client queue depth limit
|
||||
@@ -113,6 +115,13 @@ class osd_t
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int inode_vanish_time = 60;
|
||||
int log_level = 0;
|
||||
bool auto_scrub = false;
|
||||
uint64_t global_scrub_interval = 30*86400;
|
||||
uint64_t scrub_queue_depth = 1;
|
||||
uint64_t scrub_sleep_ms = 0;
|
||||
uint32_t scrub_list_limit = 1000;
|
||||
bool scrub_find_best = true;
|
||||
uint64_t scrub_ec_max_bruteforce = 100;
|
||||
|
||||
// cluster state
|
||||
|
||||
@@ -135,15 +144,24 @@ class osd_t
|
||||
std::set<pool_pg_num_t> dirty_pgs;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
int copies_to_delete_after_sync_count = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
std::map<object_id, osd_op_t*> scrub_ops;
|
||||
bool recovery_last_degraded = true;
|
||||
pool_pg_num_t recovery_last_pg;
|
||||
object_id recovery_last_oid;
|
||||
int recovery_pg_done = 0, recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Scrubbing
|
||||
uint64_t scrub_nearest_ts = 0;
|
||||
int scrub_timer_id = -1;
|
||||
pool_pg_num_t scrub_last_pg = {};
|
||||
osd_op_t *scrub_list_op = NULL;
|
||||
pg_list_result_t scrub_cur_list = {};
|
||||
uint64_t scrub_list_pos = 0;
|
||||
|
||||
// Unstable writes
|
||||
uint64_t unstable_write_count = 0;
|
||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||
@@ -221,6 +239,14 @@ class osd_t
|
||||
bool continue_recovery();
|
||||
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
|
||||
|
||||
// scrub
|
||||
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
|
||||
int pick_next_scrub(object_id & next_oid);
|
||||
void submit_scrub_op(object_id oid);
|
||||
bool continue_scrub();
|
||||
void plan_scrub(pg_t & pg, bool report_state = true);
|
||||
void schedule_scrub(pg_t & pg);
|
||||
|
||||
// op execution
|
||||
void exec_op(osd_op_t *cur_op);
|
||||
void finish_op(osd_op_t *cur_op, int retval);
|
||||
@@ -235,13 +261,19 @@ class osd_t
|
||||
void autosync();
|
||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||
void continue_primary_read(osd_op_t *cur_op);
|
||||
void continue_primary_scrub(osd_op_t *cur_op);
|
||||
void continue_primary_describe(osd_op_t *cur_op);
|
||||
void continue_primary_write(osd_op_t *cur_op);
|
||||
void cancel_primary_write(osd_op_t *cur_op);
|
||||
void continue_primary_sync(osd_op_t *cur_op);
|
||||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
|
||||
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
@@ -256,10 +288,11 @@ class osd_t
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||
|
||||
void continue_chained_read(osd_op_t *cur_op);
|
||||
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
||||
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
||||
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
||||
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
||||
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user