Compare commits

..

1 Commits

Author SHA1 Message Date
Vitaliy Filippov 1bb33f372c Add bindiff for tests
Test / test_rebalance_verify_imm (push) Successful in 1m40s Details
Test / test_dd (push) Successful in 12s Details
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec (push) Successful in 1m46s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m51s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_osd_tags (push) Successful in 10s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m20s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m12s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
2024-09-21 20:26:19 +03:00
92 changed files with 624 additions and 2217 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \ RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \ RUN set -ex; \

View File

@ -828,60 +828,6 @@ jobs:
echo "" echo ""
done done
# CI job: runs tests/test_resize.sh after the `build` job has finished,
# inside the ${{env.TEST_IMAGE}} container image tagged with the commit SHA.
test_resize:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
# The id lets the next step inspect this step's outcome
id: test
# The step is limited to 3 minutes
timeout-minutes: 3
run: /root/vitastor/tests/test_resize.sh
- name: Print logs
# always() lets this step be evaluated even after a failed step;
# the logs are printed only when the "test" step failed
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
# CI job: runs tests/test_resize_auto.sh after the `build` job has finished,
# inside the ${{env.TEST_IMAGE}} container image tagged with the commit SHA.
test_resize_auto:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
# The id lets the next step inspect this step's outcome
id: test
# The step is limited to 3 minutes
timeout-minutes: 3
run: /root/vitastor/tests/test_resize_auto.sh
- name: Print logs
# always() lets this step be evaluated even after a failed step;
# the logs are printed only when the "test" step failed
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
# CI job: runs tests/test_snapshot_pool2.sh after the `build` job has finished,
# inside the ${{env.TEST_IMAGE}} container image tagged with the commit SHA.
test_snapshot_pool2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
# The id lets the next step inspect this step's outcome
id: test
# The step is limited to 3 minutes
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot_pool2.sh
- name: Print logs
# always() lets this step be evaluated even after a failed step;
# the logs are printed only when the "test" step failed
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_osd_tags: test_osd_tags:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: build needs: build

View File

@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor) project(vitastor)
set(VITASTOR_VERSION "1.9.3") set(VITASTOR_VERSION "1.9.1")
add_subdirectory(src) add_subdirectory(src)

View File

@ -1,4 +1,4 @@
# Vitastor ## Vitastor
[Read English version](README.md) [Read English version](README.md)
@ -22,7 +22,7 @@ TCP и RDMA и на хорошем железе может достигать з
Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes. Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Другие драйверы могут также быть легко реализованы. Другие драйверы могут также быть легко реализованы.
Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md). Подробности смотрите в документации по ссылкам ниже.
## Презентации и записи докладов ## Презентации и записи докладов
@ -51,7 +51,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры - Параметры
- [Общие](docs/config/common.ru.md) - [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md) - [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.ru.md) - [Клиентский код](docs/config/client.en.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md) - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md) - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md) - [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@ -22,7 +22,7 @@ or internal systems of public clouds.
Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers. Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
More drivers may be created easily. More drivers may be created easily.
Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md). Read more details below in the documentation.
## Talks and presentations ## Talks and presentations

View File

@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v1.9.3 VITASTOR_VERSION ?= v1.9.1
all: build push all: build push

View File

@ -49,7 +49,7 @@ spec:
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.9.3 image: vitalif/vitastor-csi:v1.9.1
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@ -121,7 +121,7 @@ spec:
privileged: true privileged: true
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.9.3 image: vitalif/vitastor-csi:v1.9.1
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@ -3,10 +3,10 @@ module vitastor.io/csi
go 1.15 go 1.15
require ( require (
github.com/container-storage-interface/spec v1.8.0 github.com/container-storage-interface/spec v1.4.0
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
github.com/kubernetes-csi/csi-lib-utils v0.9.1 github.com/kubernetes-csi/csi-lib-utils v0.9.1
golang.org/x/net v0.7.0 golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1 google.golang.org/grpc v1.33.1
google.golang.org/protobuf v1.24.0 google.golang.org/protobuf v1.24.0

View File

@ -41,8 +41,8 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4= github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/container-storage-interface/spec v1.8.0 h1:D0vhF3PLIZwlwZEf2eNbpujGCNwspwTYf2idJRJx4xI= github.com/container-storage-interface/spec v1.4.0 h1:ozAshSKxpJnYUfmkpZCTYyF/4MYeYlhdXbAvPvfGmkg=
github.com/container-storage-interface/spec v1.8.0/go.mod h1:ROLik+GhPslwwWRNFF1KasPzroNARibH2rfz1rkg4H0= github.com/container-storage-interface/spec v1.4.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -182,7 +182,6 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
@ -196,7 +195,6 @@ golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@ -215,7 +213,6 @@ golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCc
golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@ -231,10 +228,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb h1:eBmm0M9fYhWpKZLjQUUKka/LtIxf46G4fxeEz5KJr9U=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@ -245,7 +240,6 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@ -265,22 +259,13 @@ golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@ -301,10 +286,8 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@ -5,7 +5,7 @@ package vitastor
const ( const (
vitastorCSIDriverName = "csi.vitastor.io" vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.9.3" vitastorCSIDriverVersion = "1.9.1"
) )
// Config struct fills the parameters of request or user input // Config struct fills the parameters of request or user input

View File

@ -8,9 +8,11 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"strings" "strings"
"bytes"
"strconv" "strconv"
"time" "time"
"os" "os"
"os/exec"
"io/ioutil" "io/ioutil"
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer" "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
@ -112,6 +114,22 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
return ctxVars, nil return ctxVars, nil
} }
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err := c.Run()
if (err != nil)
{
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), stderr.Bytes(), nil
}
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error) func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{ {
if (ctxVars["configPath"] != "") if (ctxVars["configPath"] != "")
@ -140,12 +158,6 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field") return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field")
} }
err := cs.checkCaps(volumeCapabilities)
if (err != nil)
{
return nil, err
}
etcdVolumePrefix := req.Parameters["etcdVolumePrefix"] etcdVolumePrefix := req.Parameters["etcdVolumePrefix"]
poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64) poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64)
if (poolId == 0) if (poolId == 0)
@ -289,44 +301,13 @@ func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req
return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil") return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil")
} }
err := cs.checkCaps(volumeCapabilities)
if (err != nil)
{
return nil, err
}
return &csi.ValidateVolumeCapabilitiesResponse{
Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
VolumeCapabilities: req.VolumeCapabilities,
},
}, nil
}
func (cs *ControllerServer) checkCaps(volumeCapabilities []*csi.VolumeCapability) error
{
var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode
for _, mode := range []csi.VolumeCapability_AccessMode_Mode{ for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER, csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY, csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY,
csi.VolumeCapability_AccessMode_SINGLE_NODE_SINGLE_WRITER,
csi.VolumeCapability_AccessMode_SINGLE_NODE_MULTI_WRITER,
} { } {
volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode}) volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
} }
for _, capability := range volumeCapabilities
{
if (capability.GetBlock() != nil)
{
for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
csi.VolumeCapability_AccessMode_MULTI_NODE_SINGLE_WRITER,
csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
} {
volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
}
break
}
}
capabilitySupport := false capabilitySupport := false
for _, capability := range volumeCapabilities for _, capability := range volumeCapabilities
@ -342,10 +323,14 @@ func (cs *ControllerServer) checkCaps(volumeCapabilities []*csi.VolumeCapability
if (!capabilitySupport) if (!capabilitySupport)
{ {
return status.Errorf(codes.NotFound, "%v not supported", volumeCapabilities) return nil, status.Errorf(codes.NotFound, "%v not supported", req.GetVolumeCapabilities())
} }
return nil return &csi.ValidateVolumeCapabilitiesResponse{
Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
VolumeCapabilities: req.VolumeCapabilities,
},
}, nil
} }
// ListVolumes returns a list of volumes // ListVolumes returns a list of volumes

View File

@ -227,32 +227,7 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
isBlock := req.GetVolumeCapability().GetBlock() != nil isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted // Check that it's not already mounted
notmnt, err := mount.IsNotMountPoint(ns.mounter, targetPath) _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
if (err == nil)
{
if (!notmnt)
{
klog.Errorf("target path %s is already mounted", targetPath)
return nil, fmt.Errorf("target path %s is already mounted", targetPath)
}
var finfo os.FileInfo
finfo, err = os.Stat(targetPath)
if (err != nil)
{
klog.Errorf("failed to stat %s: %v", targetPath, err)
return nil, err
}
if (finfo.IsDir() != (!isBlock))
{
err = os.Remove(targetPath)
if (err != nil)
{
klog.Errorf("failed to remove %s (to recreate it with correct type): %v", targetPath, err)
return nil, err
}
err = os.ErrNotExist
}
}
if (err != nil) if (err != nil)
{ {
if (os.IsNotExist(err)) if (os.IsNotExist(err))
@ -305,7 +280,6 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()} diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
if (isBlock) if (isBlock)
{ {
klog.Infof("bind-mounting %s to %s", devicePath, targetPath)
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"}) err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
} }
else else
@ -335,40 +309,39 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
readOnly := Contains(opt, "ro") readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly) if (existingFormat == "" && !readOnly)
{ {
var cmdOut []byte
switch fsType switch fsType
{ {
case "ext4": case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath} args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
_, err = systemCombined("mkfs.ext4", args...) cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
case "xfs": case "xfs":
_, err = systemCombined("mkfs.xfs", "-K", devicePath) cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
} }
if (err != nil) if (err != nil)
{ {
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap goto unmap
} }
} }
klog.Infof("formatting and mounting %s to %s with FS %s, options: %v", devicePath, targetPath, fsType, opt)
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt) err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
if (err == nil)
{
klog.Infof("successfully mounted %s to %s", devicePath, targetPath)
}
// Try to run online resize on mount. // Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd. // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly) if (err == nil && existingFormat != "" && !readOnly)
{ {
var cmdOut []byte
switch (fsType) switch (fsType)
{ {
case "ext4": case "ext4":
_, err = systemCombined("resize2fs", devicePath) cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
case "xfs": case "xfs":
_, err = systemCombined("xfs_growfs", devicePath) cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
} }
if (err != nil) if (err != nil)
{ {
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap goto unmap
} }
} }
@ -412,7 +385,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName) defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
targetPath := req.GetStagingTargetPath() targetPath := req.GetStagingTargetPath()
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath) devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil) if (err != nil)
{ {
if (os.IsNotExist(err)) if (os.IsNotExist(err))
@ -429,16 +402,6 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
return &csi.NodeUnstageVolumeResponse{}, nil return &csi.NodeUnstageVolumeResponse{}, nil
} }
refList, err := ns.mounter.GetMountRefs(targetPath)
if (err != nil)
{
return nil, err
}
if (len(refList) > 0)
{
klog.Warningf("%s is still referenced: %v", targetPath, refList)
}
// unmount // unmount
err = mount.CleanupMountPoint(targetPath, ns.mounter, false) err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
if (err != nil) if (err != nil)
@ -447,7 +410,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
} }
// unmap device // unmap device
if (len(refList) == 0) if (refCount == 1)
{ {
if (!ns.useVduse) if (!ns.useVduse)
{ {
@ -488,20 +451,15 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that stagingTargetPath is mounted // Check that stagingTargetPath is mounted
notmnt, err := mount.IsNotMountPoint(ns.mounter, stagingTargetPath) _, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
if (err != nil) if (err != nil)
{ {
klog.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err) klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
return nil, fmt.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err) return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
}
else if (notmnt)
{
klog.Errorf("staging path %v is not mounted", stagingTargetPath)
return nil, fmt.Errorf("staging path %v is not mounted", stagingTargetPath)
} }
// Check that targetPath is not already mounted // Check that targetPath is not already mounted
notmnt, err = mount.IsNotMountPoint(ns.mounter, targetPath) _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil) if (err != nil)
{ {
if (os.IsNotExist(err)) if (os.IsNotExist(err))
@ -536,11 +494,6 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
return nil, err return nil, err
} }
} }
else if (!notmnt)
{
klog.Errorf("target path %s is already mounted", targetPath)
return nil, fmt.Errorf("target path %s is already mounted", targetPath)
}
execArgs := []string{"--bind", stagingTargetPath, targetPath} execArgs := []string{"--bind", stagingTargetPath, targetPath}
if (req.GetReadonly()) if (req.GetReadonly())

View File

@ -4,7 +4,6 @@
package vitastor package vitastor
import ( import (
"bytes"
"errors" "errors"
"encoding/json" "encoding/json"
"fmt" "fmt"
@ -16,8 +15,6 @@ import (
"syscall" "syscall"
"k8s.io/klog" "k8s.io/klog"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
) )
func Contains(list []string, s string) bool func Contains(list []string, s string) bool
@ -76,10 +73,6 @@ func checkVduseSupport() bool
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.", " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
) )
} }
else
{
klog.Infof("VDUSE support enabled successfully")
}
return vduse return vduse
} }
@ -104,7 +97,6 @@ func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, e
{ {
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr) return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
} }
klog.Infof("Attached volume %s via NBD as %s", volName, dev)
return dev, err return dev, err
} }
@ -225,7 +217,6 @@ func mapVduse(stateDir string, volName string, ctxVars map[string]string, readon
err = os.WriteFile(stateFile, stateJSON, 0600) err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil) if (err == nil)
{ {
klog.Infof("Attached volume %s via VDUSE as %s (VDPA ID %s)", volName, blockdev, vdpaId)
return blockdev, vdpaId, nil return blockdev, vdpaId, nil
} }
} }
@ -308,35 +299,3 @@ func unmapVduseById(stateDir, vdpaId string)
os.Remove(pidFile) os.Remove(pidFile)
} }
} }
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err := c.Run()
if (err != nil)
{
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s\nOutput:\n%s", err, stdoutStr+stderrStr)
return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), stderr.Bytes(), nil
}
func systemCombined(program string, args ...string) ([]byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var out bytes.Buffer
c.Stdout, c.Stderr = &out, &out
err := c.Run()
if (err != nil)
{
outStr := string(out.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", outStr, err)
return nil, status.Error(codes.Internal, outStr+" (status "+err.Error()+")")
}
return out.Bytes(), nil
}

2
debian/changelog vendored
View File

@ -1,4 +1,4 @@
vitastor (1.9.3-1) unstable; urgency=medium vitastor (1.9.1-1) unstable; urgency=medium
* Bugfixes * Bugfixes

View File

@ -106,8 +106,8 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
it (they have internal SSD cache even though it's not stated in datasheets). it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and [disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to [disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync). "all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
vitastor-disk tried to do that by default, first checking/disabling drive cache. vitastor-disk tried to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSD get initialized with "none". If it can't disable drive cache, OSD get initialized with "none".

View File

@ -112,6 +112,6 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
указано в спецификациях). указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all" [disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync). также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).

View File

@ -118,13 +118,12 @@ Physical block size of the journal device. Must be a multiple of
- Type: boolean - Type: boolean
- Default: false - Default: false
Do not issue fsyncs to the data device, i.e. do not force it to flush cache. Do not issue fsyncs to the data device, i.e. do not flush its cache.
Safe ONLY if your data device has write-through cache or if write-back Safe ONLY if your data device has write-through cache. If you disable
cache is disabled. If you disable drive cache manually with `hdparm` or the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
writing to `/sys/.../scsi_disk/cache_type` then make sure that you do it that the cache disable command is run every time before starting Vitastor
every time before starting Vitastor OSD (vitastor-disk does it automatically). OSD, for example, in the systemd unit. See also `immediate_commit` option
See also [immediate_commit](layout-cluster.en.md#immediate_commit) for the instructions to disable cache and how to benefit from it.
for information about how to benefit from disabled cache.
## disable_meta_fsync ## disable_meta_fsync
@ -172,7 +171,8 @@ size, it actually has to write the whole 4 KB sector.
Because of this it can actually be beneficial to use SSDs which work well Because of this it can actually be beneficial to use SSDs which work well
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
and meta_block_size. But at the moment, no such SSDs are known... and meta_block_size. But the only SSD that may fit into this category is
Intel Optane (probably, not tested yet).
Clients don't need to be aware of disk_alignment, so it's not required to Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global. put a modified value into etcd key /vitastor/config/global.

View File

@ -122,14 +122,13 @@ SSD-диске, иначе производительность пострада
- Тип: булево (да/нет) - Тип: булево (да/нет)
- Значение по умолчанию: false - Значение по умолчанию: false
Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш. Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
записью (write-through) или если кэш с отложенной записью (write-back) отключён. записью (write-through). Если вы отключаете кэш через `hdparm` или
Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`, `scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
(vitastor-disk делает это автоматически). Смотрите также опцию Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
[immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том, и о том, как из этого извлечь выгоду.
как извлечь выгоду из отключённого кэша.
## disable_meta_fsync ## disable_meta_fsync
@ -180,8 +179,9 @@ SSD и HDD диски используют 4 КБ физические сект
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment, меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
journal_block_size и meta_block_size. Однако на данный момент такие SSD journal_block_size и meta_block_size. Однако единственные SSD, которые
не известны... теоретически могут попасть в эту категорию - это Intel Optane (но и это
пока не проверялось автором).
Клиентам не обязательно знать про disk_alignment, так что помещать значение Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно. этого параметра в etcd в /vitastor/config/global не нужно.

View File

@ -55,7 +55,7 @@ Examples:
OSD placement tree is set in a separate etcd key `/vitastor/config/node_placement` OSD placement tree is set in a separate etcd key `/vitastor/config/node_placement`
in the following JSON format: in the following JSON format:
``` `
{ {
"<node name or OSD number>": { "<node name or OSD number>": {
"level": "<level>", "level": "<level>",
@ -63,7 +63,7 @@ in the following JSON format:
}, },
... ...
} }
``` `
Here, if a node name is a number then it is assumed to refer to an OSD. Here, if a node name is a number then it is assumed to refer to an OSD.
Level of the OSD is always "osd" and cannot be overridden. You may only Level of the OSD is always "osd" and cannot be overridden. You may only

View File

@ -54,7 +54,7 @@
Дерево размещения OSD задаётся в отдельном ключе etcd `/vitastor/config/node_placement` Дерево размещения OSD задаётся в отдельном ключе etcd `/vitastor/config/node_placement`
в следующем JSON-формате: в следующем JSON-формате:
``` `
{ {
"<имя узла или номер OSD>": { "<имя узла или номер OSD>": {
"level": "<уровень>", "level": "<уровень>",
@ -62,7 +62,7 @@
}, },
... ...
} }
``` `
Здесь, если название узла - число, считается, что это OSD. Уровень OSD Здесь, если название узла - число, считается, что это OSD. Уровень OSD
всегда равен "osd" и не может быть переопределён. Для OSD вы можете только всегда равен "osd" и не может быть переопределён. Для OSD вы можете только

View File

@ -97,9 +97,9 @@
it (they have internal SSD cache even though it's not stated in datasheets). it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and [disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to [disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync). "all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
vitastor-disk tried to do that by default, first checking/disabling drive cache. vitastor-disk tried to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSD get initialized with "none". If it can't disable drive cache, OSD get initialized with "none".
info_ru: | info_ru: |
@ -156,6 +156,6 @@
указано в спецификациях). указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all" [disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync). также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).

View File

@ -110,22 +110,20 @@
type: bool type: bool
default: false default: false
info: | info: |
Do not issue fsyncs to the data device, i.e. do not force it to flush cache. Do not issue fsyncs to the data device, i.e. do not flush its cache.
Safe ONLY if your data device has write-through cache or if write-back Safe ONLY if your data device has write-through cache. If you disable
cache is disabled. If you disable drive cache manually with `hdparm` or the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
writing to `/sys/.../scsi_disk/cache_type` then make sure that you do it that the cache disable command is run every time before starting Vitastor
every time before starting Vitastor OSD (vitastor-disk does it automatically). OSD, for example, in the systemd unit. See also `immediate_commit` option
See also [immediate_commit](layout-cluster.en.md#immediate_commit) for the instructions to disable cache and how to benefit from it.
for information about how to benefit from disabled cache.
info_ru: | info_ru: |
Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш. Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
записью (write-through) или если кэш с отложенной записью (write-back) отключён. записью (write-through). Если вы отключаете кэш через `hdparm` или
Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`, `scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
(vitastor-disk делает это автоматически). Смотрите также опцию Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
[immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том, и о том, как из этого извлечь выгоду.
как извлечь выгоду из отключённого кэша.
- name: disable_meta_fsync - name: disable_meta_fsync
type: bool type: bool
default: false default: false
@ -181,7 +179,8 @@
Because of this it can actually be beneficial to use SSDs which work well Because of this it can actually be beneficial to use SSDs which work well
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
and meta_block_size. But at the moment, no such SSDs are known... and meta_block_size. But the only SSD that may fit into this category is
Intel Optane (probably, not tested yet).
Clients don't need to be aware of disk_alignment, so it's not required to Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global. put a modified value into etcd key /vitastor/config/global.
@ -199,8 +198,9 @@
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment, меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
journal_block_size и meta_block_size. Однако на данный момент такие SSD journal_block_size и meta_block_size. Однако единственные SSD, которые
не известны... теоретически могут попасть в эту категорию - это Intel Optane (но и это
пока не проверялось автором).
Клиентам не обязательно знать про disk_alignment, так что помещать значение Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно. этого параметра в etcd в /vitastor/config/global не нужно.

View File

@ -4,8 +4,6 @@
[Читать на русском](opennebula.ru.md) [Читать на русском](opennebula.ru.md)
# OpenNebula
## Automatic Installation ## Automatic Installation
OpenNebula plugin is packaged as `vitastor-opennebula` Debian and RPM package since Vitastor 1.9.0. So: OpenNebula plugin is packaged as `vitastor-opennebula` Debian and RPM package since Vitastor 1.9.0. So:

View File

@ -4,8 +4,6 @@
[Read in English](opennebula.en.md) [Read in English](opennebula.en.md)
# OpenNebula
## Автоматическая установка ## Автоматическая установка
Плагин OpenNebula Vitastor распространяется как Debian и RPM пакет `vitastor-opennebula`, начиная с версии Vitastor 1.9.0. Так что: Плагин OpenNebula Vitastor распространяется как Debian и RPM пакет `vitastor-opennebula`, начиная с версии Vitastor 1.9.0. Так что:

View File

@ -6,150 +6,19 @@
# Architecture # Architecture
- [Server-side components](#server-side-components)
- [Basic concepts](#basic-concepts) - [Basic concepts](#basic-concepts)
- [Client-side components](#client-side-components)
- [Additional utilities](#additional-utilities)
- [Overall read/write process](#overall-read-write-process)
- [Nuances of request handling](#nuances-of-request-handling)
- [Similarities to Ceph](#similarities-to-ceph) - [Similarities to Ceph](#similarities-to-ceph)
- [Differences from Ceph](#differences-from-ceph) - [Differences from Ceph](#differences-from-ceph)
- [Implementation Principles](#implementation-principles) - [Implementation Principles](#implementation-principles)
## Server-side components
- **OSD** (Object Storage Daemon) is a process that directly works with the disk, stores data
and serves read/write requests. One OSD serves one disk (or one partition). OSDs talk to etcd
and to each other — they receive cluster state from etcd, and send read/write requests for
secondary copies of data to other OSDs.
- **etcd** — clustered key/value database, used as a reliable storage for configuration
and high-level cluster state. Etcd is the component that prevents splitbrain in the cluster.
Data blocks are not stored in etcd, etcd doesn't participate in data write or read path.
- **Monitor** — a separate node.js based daemon which monitors the cluster, calculates
required configuration changes and saves them to etcd, thus commanding OSDs to apply these
changes. The monitor also aggregates cluster statistics. OSDs don't talk to the monitor; the
monitor only sends data to and receives data from etcd.
## Basic concepts ## Basic concepts
- **Pool** is a container for data that has equal redundancy scheme and disk placement rules. - OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
- **PG (Placement Group)** is a "shard" of the cluster, subdivision unit that has its own - PG (Placement Group) is a "shard" of the cluster, group of data stored on one set of replicas.
set of OSDs for data storage. - Pool is a container for data that has equal redundancy scheme and placement rules.
- **Failure Domain** is a group of OSDs, from the simultaneous failure of which you are - Monitor is a separate daemon that watches cluster state and handles failures.
protected by Vitastor. Default failure domain is "host" (server), but you can choose a - Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
larger (for example, a rack of servers) or smaller (a single drive) failure domain - Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
for every pool.
- **Placement Tree** (similar to Ceph CRUSH Tree) groups OSDs in a hierarchy to later
split them into Failure Domains.
## Client-side components
- **Client library** encapsulates client I/O logic. Client library connects to etcd and to all OSDs,
receives cluster state from etcd, sends read and write requests directly to all OSDs. Due
to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
to different OSDs, but clients always know where each data block is stored and connect directly
to the right OSD.
All other client-side components are based on the client library:
- **[vitastor-cli](../usage/cli.en.md)** — command-line utility for cluster management.
Allows to view cluster state, manage pools and images, i.e. create, modify and remove
virtual disks, their snapshots and clones.
- **[QEMU driver](../usage/qemu.en.md)** — pluggable QEMU module allowing QEMU/KVM virtual
machines to work with virtual Vitastor disks directly from userspace through the client library,
without the need to attach disks as kernel block devices. However, if you want to attach
disks, you can also do that with the same driver and [VDUSE](../usage/qemu.en.md#vduse).
- **[vitastor-nbd](../usage/nbd.en.md)** — utility that allows to attach Vitastor disks as
kernel block devices using NBD (Network Block Device), which works more like "BUSE"
(Block Device In Userspace). Vitastor doesn't have Linux kernel modules for the same task
(at least by now). NBD is an older, non-recommended way to attach disks — you should use
VDUSE whenever you can.
- **[CSI driver](../installation/kubernetes.en.md)** — driver for attaching Vitastor images
as Kubernetes persistent volumes. Works through VDUSE (when available) or NBD — images are
attached as kernel block devices and mounted into containers.
- **Drivers for Proxmox, OpenStack and so on** — pluggable modules for corresponding systems,
allowing to use Vitastor as storage in them.
- **[vitastor-nfs](../usage/nfs.en.md)** — NFS 3.0 server allowing export of two file system variants:
the first is a simplified pseudo-FS for file-based access to Vitastor block images (for non-QEMU
hypervisors with NFS support), the second is **VitastorFS**, full-featured clustered POSIX FS.
Both variants support parallel access from multiple vitastor-nfs servers. In fact, you are
not required to setup separate NFS servers at all and use vitastor-nfs mount command on every
client node — it starts the NFS server and mounts the FS locally.
- **[fio driver](../usage/fio.en.md)** — pluggable module for fio disk benchmarking tool for
running performance tests on your Vitastor cluster.
- **vitastor-kv** — client for a key-value DB working over shared block volumes (usual
vitastor images). VitastorFS metadata is stored in vitastor-kv.
## Additional utilities
- **vitastor-disk** — a Vitastor OSD disk management tool. You can create, remove,
resize and move OSD partitions with it.
## Overall read/write process
- Vitastor stores virtual disks, also named "images" or "inodes".
- Each image is stored in some pool. Pool specifies storage parameters such as redundancy
scheme (replication or EC — erasure codes, i.e. error correction codes), failure domain
and restrictions on OSD selection for image data placement. See [Pool configuration](../config/pool.en.md) for details.
- Each image is split into objects/blocks of fixed size, equal to [block_size](../config/layout-cluster.en.md#block_size)
(128 KB by default), multiplied by data part count for EC or 1 for replicas. That is,
if a pool uses EC 4+2 coding scheme (4 data parts + 2 parity parts), then, with the
default block_size, images are split into 512 KB objects.
- Client read/write requests are split into parts at object boundaries.
- Each object is mapped to a PG number it belongs to, by simply taking a remainder of
division of its offset by PG count of the image's pool.
- Client reads primary OSD for all PGs from etcd. Primary OSD for each PG is assigned
by the monitor during cluster operation, along with the full PG OSD set.
- If not already connected, client connects to primary OSDs of all PGs involved in a
read/write request and sends parts of the request to them.
- If a primary OSD is unavailable, client retries connection attempts indefinitely
either until it becomes available or until the monitor assigns another OSD as primary
for that PG.
- Client also retries requests if the primary OSD replies with error code EPIPE, meaning
that the PG is inactive at this OSD at the moment - for example, when the primary OSD
is switched, or if the primary OSD itself loses connection to replicas during request
handling.
- Primary OSD determines where the parts of the object are stored. By default, all objects
are assumed to be stored at the target OSD set of a PG, but some of them may be present
at a different OSD set if they are degraded or moved, or if the data rebalancing process
is active. The OSD doesn't do any network requests for this: it calculates locations of all
objects during PG activation and stores them in memory.
- Primary OSD handles the request locally when it can - for example, when it's a read
from a replicated pool or when it's a read from an EC pool involving only one data part
stored on the OSD's local disk.
- When a request requires reads or writes to additional OSDs, primary OSD uses already
established connections to secondary OSDs of the PG to execute these requests. This happens
in parallel to local disk operations. All such connections are guaranteed to be already
established when the PG is active, and if any of them is dropped, PG is restarted and
all current read/write operations to it fail with EPIPE error and are retried by clients.
- After completing all secondary read/write requests, primary OSD sends the response to
the client.
### Nuances of request handling
- If a pool uses erasure codes and some of the OSDs are unavailable, primary OSDs recover
data from the remaining parts during read.
- Each object has a version number. During write, primary OSD first determines the current
version of the object. As primary OSD usually stores the object or its part itself, most
of the time version is read from the memory of the OSD itself. However, if primary OSD
doesn't contain parts of the object, it requests the version number from a secondary OSD
which has that part. Such request still doesn't involve reading from the disk though,
because object metadata, including version number, is always stored in OSD memory.
- If a pool uses erasure codes, partial writes of an object require reading other parts of
it from secondary OSDs or from the local disk of the primary OSD itself. This is called
"read-modify-write" process.
- If a pool uses erasure codes, two-phase write process is used to get rid of the Write Hole
problem: first a new version of object parts is written to all secondary OSDs without
removing the previous version, and then, after receiving successful write confirmations
from all OSDs, new version is committed and the old one is allowed to be removed.
- If a pool doesn't use immediate_commit mode, then write requests sent by clients aren't
treated as committed to physical media instantly. Clients have to send a separate type of
request (SYNC) to commit changes, and until it is sent, new versions of data are
allowed to be lost if some OSDs die. Thus, when immediate_commit is disabled, clients
store copies of all write requests in memory and repeat them from there when the
connection to primary OSD is lost. This in-memory copy is removed after a successful
SYNC, and to prevent excessive memory usage, clients also do an automatic SYNC
every [client_dirty_limit](../config/network.en.md#client_dirty_limit) written bytes.
## Similarities to Ceph ## Similarities to Ceph

View File

@ -11,7 +11,6 @@
- [Серверные компоненты](#серверные-компоненты) - [Серверные компоненты](#серверные-компоненты)
- [Базовые понятия](#базовые-понятия) - [Базовые понятия](#базовые-понятия)
- [Клиентские компоненты](#клиентские-компоненты) - [Клиентские компоненты](#клиентские-компоненты)
- [Дополнительные утилиты](#дополнительные-утилиты)
- [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения) - [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения)
- [Особенности обработки запросов](#особенности-обработки-запросов) - [Особенности обработки запросов](#особенности-обработки-запросов)
- [Схожесть с Ceph](#схожесть-с-ceph) - [Схожесть с Ceph](#схожесть-с-ceph)
@ -35,9 +34,8 @@
- **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD. - **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
- **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор - **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор
OSD для хранения данных (копий или частей объектов). OSD для хранения данных (копий или частей объектов).
- **Домен отказа (Failure Domain)** — группа OSD, от одновременного падения которых должен защищать - **Домен отказа (Failure Domain)** — группа OSD, одновременное падение которых рассматривается
Vitastor. По умолчанию домен отказа — "host" (сервер), но вы можете установить для пула как больший как вероятное. По умолчанию это "host" (сервер).
домен отказа (например, стойку серверов), так и меньший (например, отдельный диск).
- **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD - **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD
в узлы, которые далее можно использовать как домены отказа. в узлы, которые далее можно использовать как домены отказа.
@ -51,39 +49,25 @@
На базе клиентской библиотеки реализованы все остальные клиенты: На базе клиентской библиотеки реализованы все остальные клиенты:
- **[vitastor-cli](../usage/cli.ru.md)** — утилита командной строки для управления кластером. - **vitastor-cli** — утилита командной строки для управления кластером. В данный момент позволяет
Позволяет просматривать общее состояние кластера, управлять пулами и образами — то есть просматривать общее состояние кластера и управлять образами — т.е. создавать, менять и удалять
создавать, менять и удалять виртуальные диски, их снимки и клоны. виртуальные диски, их снимки и клоны.
- **[Драйвер QEMU](../usage/qemu.ru.md)** — подключаемый модуль QEMU, позволяющий QEMU/KVM - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
виртуальным машинам работать с виртуальными дисками Vitastor напрямую из пространства пользователя с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
с помощью клиентской библиотеки, без необходимости подключения дисков в виде блочных устройств библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
Linux. Если, однако, вы хотите подключать диски в виде блочных устройств, то вы тоже можете позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
сделать это с помощью того же самого драйвера и [VDUSE](../usage/qemu.ru.md#vduse). - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
- **[vitastor-nbd](../usage/nbd.ru.md)** — утилита, позволяющая монтировать образы Vitastor с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
в виде блочных устройств с помощью NBD (Network Block Device), на самом деле скорее работающего (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
как "BUSE" (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в (по крайней мере, пока).
Vitastor нет (по крайней мере, пока). NBD — более старый и нерекомендуемый способ подключения - **CSI драйвер** — драйвер для подключения Vitastor-образов в виде персистентных томов (PV) Kubernetes.
дисков — вам следует использовать VDUSE всегда, когда это возможно. Работает через vitastor-nbd — образы отражаются в виде блочных устройств и монтируются
- **[CSI драйвер](../installation/kubernetes.ru.md)** — драйвер для подключения Vitastor-образов в контейнеры.
в виде персистентных томов (PV) Kubernetes. Работает через VDUSE (если доступно) или через
NBD — образы отображаются в виде блочных устройств и монтируются в контейнеры.
- **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем, - **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем,
позволяющие использовать Vitastor как хранилище в оных. позволяющие использовать Vitastor как хранилище в оных.
- **[vitastor-nfs](../usage/nfs.ru.md)** — NFS 3.0 сервер, предоставляющий два варианта файловой системы: - **vitastor-nfs** — утилита, предоставляющая файловый доступ к образам в кластере Vitastor
первая — упрощённая для файлового доступа к блочным образам (для не-QEMU гипервизоров, поддерживающих NFS), по протоколу NFS 3.0. Предназначена для гипервизоров, не основанных на QEMU и Linux, но при
вторая — VitastorFS, полноценная кластерная POSIX ФС. Оба варианта поддерживают параллельный этом поддерживающих NFS.
доступ с нескольких vitastor-nfs серверов. На самом деле можно вообще не выделять
отдельные NFS-серверы, а вместо этого использовать команду vitastor-nfs mount, запускающую
NFS-сервер прямо на клиентской машине и монтирующую ФС локально.
- **[Драйвер fio](../usage/fio.ru.md)** — подключаемый модуль для утилиты тестирования
производительности дисков fio, позволяющий тестировать Vitastor-кластеры.
- **vitastor-kv** — клиент для key-value базы данных, работающей поверх разделяемого блочного
образа (обычного блочного образа vitastor). Метаданные VitastorFS хранятся именно в vitastor-kv.
## Дополнительные утилиты
- **vitastor-disk** — утилита для разметки дисков под Vitastor OSD. С её помощью можно
создавать, удалять, менять размеры или перемещать разделы OSD.
## Общий процесс записи и чтения ## Общий процесс записи и чтения
@ -114,22 +98,16 @@
находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс
ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех
объектов рассчитывается первичным OSD при активации PG и хранится в памяти. объектов рассчитывается первичным OSD при активации PG и хранится в памяти.
- Когда это возможно, первичный OSD обрабатывает запрос локально. Например, так происходит - Первичный OSD соединяется (если ещё не соединён) с вторичными OSD, на которых располагаются
при чтениях объектов из пулов с репликацией или при чтении из EC пула, затрагивающего части объекта, и отправляет им запросы чтения/записи, а также читает/пишет из/в своё локальное
только часть, хранимую на диске самого первичного OSD. хранилище, если сам входит в набор.
- Когда запрос требует записи или чтения с вторичных OSD, первичный OSD использует заранее
установленные соединения с ними для выполнения этих запросов. Это происходит параллельно
локальным операциям чтения/записи с диска самого OSD. Так как соединения к вторичным OSD PG
устанавливаются при её запуске, то они уже гарантированно установлены, когда PG активна,
и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
- После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту. - После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.
### Особенности обработки запросов ### Особенности обработки запросов
- Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный - Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный
OSD при чтении восстанавливает данные из оставшихся частей. OSD при чтении восстанавливает данные из оставшихся частей.
- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала получает номер - Каждый объект имеет номер версии. При записи объекта первичный OSD сначала читает номер
версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер
версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта
не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных
@ -137,20 +115,20 @@
так как метаданные объектов, включая номер версии, все OSD хранят в памяти. так как метаданные объектов, включая номер версии, все OSD хранят в памяти.
- Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления - Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления
чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска
самого первичного OSD. Это называется процессом "чтение-модификация-запись" (read-modify-write). самого первичного OSD.
- Если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется - Также, если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей
объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения
успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой. успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой.
- Если в пуле не включён режим immediate_commit, то запросы записи, отправляемые клиентами, - Если в кластере не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты
должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса), должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса),
а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть, а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть,
если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все
запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении
с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном SYNC, с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном fsync,
а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты
автоматически выполняют SYNC каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit) автоматически выполняют fsync каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
записанных байт. записанных байт.
## Схожесть с Ceph ## Схожесть с Ceph

View File

@ -32,7 +32,7 @@
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743, - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS - HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
## Configure monitors ## Configure monitors

View File

@ -22,7 +22,7 @@
использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже. использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit). О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
- Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar, - Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
Toshiba MG, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет, Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места). обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
- Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2. - Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
- Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`. - Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@ -32,8 +32,8 @@
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743, - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS - HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
## Настройте мониторы ## Настройте мониторы

View File

@ -171,14 +171,7 @@ to make them use the new version of the client library.
### 1.7.x to 1.8.0 ### 1.7.x to 1.8.0
It's recommended to upgrade from version <= 1.7.x to version >= 1.8.0 with full downtime, After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
i.e. you should first stop clients and then the cluster (OSDs and monitor), because 1.8.0
includes a fix for etcd event stream inconsistency which could lead to "incomplete" objects
appearing in EC pools, and in rare cases, probably, even to data corruption during mass OSD
restarts. It doesn't mean that you WILL hit this problem if you upgrade without full downtime,
but it's better to secure yourself against it.
Also, if you upgrade from version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
(VMs and so on), otherwise they will hang when monitor clears old PG configuration key, (VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
which happens 24 hours after upgrade. which happens 24 hours after upgrade.

View File

@ -168,14 +168,7 @@ done
### 1.7.x -> 1.8.0 ### 1.7.x -> 1.8.0
Обновляться с версий <= 1.7.x до версий >= 1.8.0 рекомендуется с полной остановкой После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех
сначала клиентов, а затем кластера, так как в 1.8.0 исправлена проблема (неконсистентность
потоков событий от etcd), способная приводить к появлению incomplete объектов в EC-пулах
и, хоть и редко, но даже к повреждению данных при массовых перезапусках OSD. Если вы
обновляетесь без полной остановки - это не значит, что вы обязательно столкнётесь с этой
проблемой, но лучше подстраховаться.
Также, если вы обновляетесь с версии <= 1.7.x до версии >= 1.8.0, НО <= 1.9.0: перезапустите всех
клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер), клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
24 часа после обновления. 24 часа после обновления.

View File

@ -13,7 +13,6 @@ It supports the following commands:
- [prepare](#prepare) - [prepare](#prepare)
- [upgrade-simple](#upgrade-simple) - [upgrade-simple](#upgrade-simple)
- [resize](#resize) - [resize](#resize)
- [raw-resize](#raw-resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable) - [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge) - [purge](#purge)
- [read-sb](#read-sb) - [read-sb](#read-sb)
@ -51,16 +50,12 @@ Options (automatic mode):
--osd_per_disk <N> --osd_per_disk <N>
Create <N> OSDs on each disk (default 1) Create <N> OSDs on each disk (default 1)
--hybrid --hybrid
Prepare hybrid (HDD+SSD, NVMe+SATA, etc.) OSDs using provided devices. By default, Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for
any passed SSDs will be used for journals and metadata, HDDs will be used for data, journals and metadata, HDDs will be used for data. Partitions for journals and
but you can override this behaviour with --fast-devices option. Journal and metadata metadata will be created automatically. Whether disks are SSD or HDD is decided
partitions will be created automatically. In the default mode, SSD and HDD disks by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object
are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,
for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal and throttle_small_writes is enabled by default.
size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.
--fast-devices /dev/nvmeX,/dev/nvmeY
In --hybrid mode, use these devices for journal and metadata instead of auto-detecting
and extracting them from the main [devices...] list.
--disable_data_fsync auto --disable_data_fsync auto
Disable data device cache and fsync (1/yes/true = on, default auto) Disable data device cache and fsync (1/yes/true = on, default auto)
--disable_meta_fsync auto --disable_meta_fsync auto
@ -132,49 +127,25 @@ Requires the `sfdisk` utility.
## resize ## resize
`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]` `vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
Resize data area and/or move journal and metadata: Resize data area and/or rewrite/move journal and metadata.
| <!-- --> | <!-- --> |
|---------------------------|----------------------------------------|
| `--move-journal TARGET` | move journal to `TARGET` |
| `--move-meta TARGET` | move metadata to `TARGET` |
| `--journal-size NEW_SIZE` | resize journal to `NEW_SIZE` |
| `--data-size NEW_SIZE` | resize data device to `NEW_SIZE` |
| `--dry-run` | only show new layout, do not apply it |
`NEW_SIZE` may include k/m/g/t suffixes.
`TARGET` may be one of:
| <!-- --> | <!-- --> |
|----------------|--------------------------------------------------------------------------|
| `<partition>` | move journal/metadata to an existing GPT partition |
| `<raw_device>` | create a GPT partition on `<raw_device>` and move journal/metadata to it |
| `""` | (empty string) move journal/metadata back to the data device |
## raw-resize
`vitastor-disk raw-resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
Resize data area and/or rewrite/move journal and metadata (manual format).
`ALL_OSD_PARAMETERS` must include all (at least all disk-related) `ALL_OSD_PARAMETERS` must include all (at least all disk-related)
parameters from OSD command line (i.e. from systemd unit or superblock). parameters from OSD command line (i.e. from systemd unit or superblock).
`NEW_LAYOUT` may include new disk layout parameters: `NEW_LAYOUT` may include new disk layout parameters:
| <!-- --> | <!-- --> | ```
|-----------------------------|-------------------------------------------| --new_data_offset SIZE resize data area so it starts at SIZE
| `--new_data_offset SIZE` | resize data area so it starts at `SIZE` | --new_data_len SIZE resize data area to SIZE bytes
| `--new_data_len SIZE` | resize data area to `SIZE` bytes | --new_meta_device PATH use PATH for new metadata
| `--new_meta_device PATH` | use `PATH` for new metadata | --new_meta_offset SIZE make new metadata area start at SIZE
| `--new_meta_offset SIZE` | make new metadata area start at `SIZE` | --new_meta_len SIZE make new metadata area SIZE bytes long
| `--new_meta_len SIZE` | make new metadata area `SIZE` bytes long | --new_journal_device PATH use PATH for new journal
| `--new_journal_device PATH` | use `PATH` for new journal | --new_journal_offset SIZE make new journal area start at SIZE
| `--new_journal_offset SIZE` | make new journal area start at `SIZE` | --new_journal_len SIZE make new journal area SIZE bytes long
| `--new_journal_len SIZE` | make new journal area `SIZE` bytes long | ```
SIZE may include k/m/g/t suffixes. If any of the new layout parameter SIZE may include k/m/g/t suffixes. If any of the new layout parameter
options are not specified, old values will be used. options are not specified, old values will be used.
@ -246,14 +217,10 @@ Intended for use from startup scripts (i.e. from systemd units).
## dump-journal ## dump-journal
`vitastor-disk dump-journal [OPTIONS] <osd_device>`
`vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>` `vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`
Dump journal in human-readable or JSON (if `--json` is specified) format. Dump journal in human-readable or JSON (if `--json` is specified) format.
You can specify any OSD device (data, metadata or journal), or the layout manually.
Options: Options:
``` ```
@ -266,35 +233,23 @@ Options:
## write-journal ## write-journal
`vitastor-disk write-journal <osd_device>`
`vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>` `vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`
Write journal from JSON taken from standard input in the same format as produced by Write journal from JSON taken from standard input in the same format as produced by
`dump-journal --json --format data`. `dump-journal --json --format data`.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## dump-meta ## dump-meta
`vitastor-disk dump-meta <osd_device>`
`vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>` `vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`
Dump metadata in JSON format. Dump metadata in JSON format.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## write-meta ## write-meta
`vitastor-disk write-meta <osd_device>`
`vitastor-disk write-meta <meta_file> <offset> <size>` `vitastor-disk write-meta <meta_file> <offset> <size>`
Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`. Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## simple-offsets ## simple-offsets
`vitastor-disk simple-offsets <device>` `vitastor-disk simple-offsets <device>`

View File

@ -13,7 +13,6 @@ vitastor-disk - инструмент командной строки для уп
- [prepare](#prepare) - [prepare](#prepare)
- [upgrade-simple](#upgrade-simple) - [upgrade-simple](#upgrade-simple)
- [resize](#resize) - [resize](#resize)
- [raw-resize](#raw-resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable) - [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge) - [purge](#purge)
- [read-sb](#read-sb) - [read-sb](#read-sb)
@ -51,17 +50,12 @@ vitastor-disk - инструмент командной строки для уп
--osd_per_disk <N> --osd_per_disk <N>
Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1) Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1)
--hybrid --hybrid
Инициализировать гибридные (HDD+SSD, NVMe+SATA и т.п.) OSD на указанных дисках. Инициализировать гибридные (HDD+SSD) OSD на указанных дисках. SSD будут
По умолчанию, SSD будут использованы для журналов и метаданных, а HDD - для данных, использованы для журналов и метаданных, а HDD - для данных. Разделы для журналов
но вы можете поменять это поведение опцией --fast-devices. Разделы для журналов и метаданных будут созданы автоматически. Является ли диск SSD или HDD, определяется
и метаданных будут созданы автоматически. В режиме по умолчанию SSD и HDD-диски по флагу `/sys/block/.../queue/rotational`. В гибридном режиме по умолчанию
различаются по флагу `/sys/block/.../queue/rotational`. Когда в гибридном режиме используется размер объекта 1 МБ вместо 128 КБ, размер журнала 1 ГБ вместо 32 МБ
для данных используются HDD, по умолчанию размер блока устанавливается 1 МБ вместо и включённый throttle_small_writes.
128 КБ, размер журнала 1 ГБ вместо 32 МБ, и throttle_small_writes включается по
умолчанию.
--fast-devices /dev/nvmeX,/dev/nvmeY
Использовать данные диски для журналов и метаданных в гибридном режиме вместо их
автоопределения и извлечения из основного списка [devices...].
--disable_data_fsync auto --disable_data_fsync auto
Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение) Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение)
--disable_meta_fsync auto --disable_meta_fsync auto
@ -135,51 +129,27 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
## resize ## resize
`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]` `vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
Изменить размер области данных и/или переместить журнал и метаданные: Изменить размер области данных и/или переместить журнал и метаданные.
| <!-- --> | <!-- --> | В `ALL_OSD_PARAMETERS` нужно указать все относящиеся к диску параметры OSD
|-------------------------------|------------------------------------------------|
| `--move-journal ЦЕЛЬ` | переместить журнал на `ЦЕЛЬ` |
| `--move-meta ЦЕЛЬ` | переместить метаданные на `ЦЕЛЬ` |
| `--journal-size НОВЫЙ_РАЗМЕР` | изменить размер журнала на `НОВЫЙ_РАЗМЕР` |
| `--data-size НОВЫЙ_РАЗМЕР` | изменить размер диска данных на `НОВЫЙ_РАЗМЕР` |
| `--dry-run` | показать новые параметры, но не применять их |
`НОВЫЙ_РАЗМЕР` может быть указан с суффиксами k/m/g/t (кило/мега/гига/терабайт).
`ЦЕЛЬ` может быть одним из:
| <!-- --> | <!-- --> |
|-----------------|-------------------------------------------------------------------------------------|
| `<раздел>` | переместить журнал/метаданные на существующий GPT-раздел |
| `<полный_диск>` | создать GPT-раздел на диске `<полный_диск>` и переместить журнал/метаданные на него |
| `""` | (пустая строка) переместить журнал/метаданные обратно на диск данных |
## raw-resize
`vitastor-disk raw-resize <ВСЕ_ПАРАМЕТРЫ_OSD> <НОВЫЕ_РАЗМЕРЫ> [--iodepth 32]`
Изменить размер области данных и/или переместить журнал и метаданные (ручной формат).
В `ВСЕ_ПАРАМЕТРЫ_OSD` нужно указать все относящиеся к диску параметры OSD
из суперблока OSD или из файла сервиса systemd (в старых версиях). из суперблока OSD или из файла сервиса systemd (в старых версиях).
В `НОВЫЕ_РАЗМЕРЫ` нужно указать новые параметры расположения данных: В `NEW_LAYOUT` нужно указать новые параметры расположения данных:
| <!-- --> | <!-- --> | ```
|-------------------------------|-------------------------------------------------------| --new_data_offset РАЗМЕР сдвинуть начало области данных на РАЗМЕР байт
| `--new_data_offset РАЗМЕР` | сдвинуть начало области данных на `РАЗМЕР` байт | --new_data_len РАЗМЕР изменить размер области данных до РАЗМЕР байт
| `--new_data_len РАЗМЕР` | изменить размер области данных до `РАЗМЕР` байт | --new_meta_device ПУТЬ использовать ПУТЬ как новое устройство метаданных
| `--new_meta_device ПУТЬ` | использовать `ПУТЬ` как новое устройство метаданных | --new_meta_offset РАЗМЕР разместить новые метаданные по смещению РАЗМЕР байт
| `--new_meta_offset РАЗМЕР` | разместить новые метаданные по смещению `РАЗМЕР` байт | --new_meta_len РАЗМЕР сделать новые метаданные размером РАЗМЕР байт
| `--new_meta_len РАЗМЕР` | сделать новые метаданные размером `РАЗМЕР` байт | --new_journal_device ПУТЬ использовать ПУТЬ как новое устройство журнала
| `--new_journal_device ПУТЬ` | использовать `ПУТЬ` как новое устройство журнала | --new_journal_offset РАЗМЕР разместить новый журнал по смещению РАЗМЕР байт
| `--new_journal_offset РАЗМЕР` | разместить новый журнал по смещению `РАЗМЕР` байт | --new_journal_len РАЗМЕР сделать новый журнал размером РАЗМЕР байт
| `--new_journal_len РАЗМЕР` | сделать новый журнал размером `РАЗМЕР` байт | ```
`РАЗМЕР` может быть указан с суффиксами k/m/g/t. Если любой из новых параметров РАЗМЕР может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
расположения не указан, он принимается равным старому значению. расположения не указан, он принимается равным старому значению.
## start/stop/restart/enable/disable ## start/stop/restart/enable/disable
@ -254,15 +224,10 @@ OSD отключены fsync-и.
## dump-journal ## dump-journal
`vitastor-disk dump-journal [OPTIONS] <osd_device>`
`vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>` `vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`
Вывести журнал в человекочитаемом или в JSON (с опцией `--json`) виде. Вывести журнал в человекочитаемом или в JSON (с опцией `--json`) виде.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
Опции: Опции:
``` ```
@ -275,37 +240,22 @@ OSD отключены fsync-и.
## write-journal ## write-journal
`vitastor-disk write-journal <osd_device>`
`vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>` `vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`
Записать журнал из JSON со стандартного ввода в формате, аналогичном `dump-journal --json --format data`. Записать журнал из JSON со стандартного ввода в формате, аналогичном `dump-journal --json --format data`.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## dump-meta ## dump-meta
`vitastor-disk dump-meta <osd_device>`
`vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>` `vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`
Вывести метаданные в формате JSON. Вывести метаданные в формате JSON.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## write-meta ## write-meta
`vitastor-disk write-meta <osd_device>`
`vitastor-disk write-meta <meta_file> <offset> <size>` `vitastor-disk write-meta <meta_file> <offset> <size>`
Записать метаданные из JSON со стандартного ввода в формате, аналогичном `dump-meta`. Записать метаданные из JSON со стандартного ввода в формате, аналогичном `dump-meta`.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## simple-offsets ## simple-offsets
`vitastor-disk simple-offsets <device>` `vitastor-disk simple-offsets <device>`

View File

@ -156,17 +156,17 @@ behind. Defragmentation removes garbage and moves data still in use to new volum
Options: Options:
| <!-- --> | <!-- --> | | <!-- --> | <!-- --> |
|----------------------------|------------------------------------------------------------------------ | |--------------------------|------------------------------------------------------------------------ |
| `--volume_untouched 86400` | Defragment volumes last appended to at least this number of seconds ago | | --volume_untouched 86400 | Defragment volumes last appended to at least this number of seconds ago |
| `--defrag_percent 50` | Defragment volumes with at least this % of removed data | | --defrag_percent 50 | Defragment volumes with at least this % of removed data |
| `--defrag_block_count 16` | Read this number of pool blocks at once during defrag | | --defrag_block_count 16 | Read this number of pool blocks at once during defrag |
| `--defrag_iodepth 16` | Move up to this number of files in parallel during defrag | | --defrag_iodepth 16 | Move up to this number of files in parallel during defrag |
| `--trace` | Print verbose defragmentation status | | --trace | Print verbose defragmentation status |
| `--dry-run` | Skip modifications, only print status | | --dry-run | Skip modifications, only print status |
| `--recalc-stats` | Recalculate all volume statistics | | --recalc-stats | Recalculate all volume statistics |
| `--include-empty` | Include old and empty volumes; make sure to restart NFS servers before using it | | --include-empty | Include old and empty volumes; make sure to restart NFS servers before using it |
| `--no-rm` | Move, but do not delete data | | --no-rm | Move, but do not delete data |
## Common options ## Common options

View File

@ -164,17 +164,17 @@ JSON-формате :-). Для инспекции содержимого БД
Опции: Опции:
| <!-- --> | <!-- --> | | <!-- --> | <!-- --> |
|----------------------------|------------------------------------------------------------------------ | |--------------------------|------------------------------------------------------------------------ |
| `--volume_untouched 86400` | Дефрагментировать только тома, в которые уже не писали это число секунд | | --volume_untouched 86400 | Дефрагментировать только тома, в которые уже не писали это число секунд |
| `--defrag_percent 50` | Дефрагментировать только тома, в которых этот % данных удалён | | --defrag_percent 50 | Дефрагментировать только тома, в которых этот % данных удалён |
| `--defrag_block_count 16` | Читать это количество блоков пула за один раз | | --defrag_block_count 16 | Читать это количество блоков пула за один раз |
| `--defrag_iodepth 16` | Перемещать одновременно до этого числа файлов | | --defrag_iodepth 16 | Перемещать одновременно до этого числа файлов |
| `--trace` | Печатать детальную статистику дефрагментации | | --trace | Печатать детальную статистику дефрагментации |
| `--dry-run` | Не производить никаких изменений, только описать выполняемые действия | | --dry-run | Не производить никаких изменений, только описать выполняемые действия |
| `--recalc-stats` | Пересчитать и сохранить статистику всех томов | | --recalc-stats | Пересчитать и сохранить статистику всех томов |
| `--include-empty` | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции | | --include-empty | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции |
| `--no-rm` | Перемещать, но не удалять данные | | --no-rm | Перемещать, но не удалять данные |
## Общие опции ## Общие опции

View File

@ -151,9 +151,9 @@ Example performance comparison:
To try VDUSE you need at least Linux 5.15, built with VDUSE support To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m). (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
Debian Linux kernels had these options disabled until 6.6, so make sure you install a newer kernel Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
(from bookworm-backports, trixie or newer Debian version) if you want to try VDUSE. You can also use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
build modules for an existing kernel manually: or build modules for Debian kernel manually:
``` ```
mkdir build mkdir build

View File

@ -154,9 +154,9 @@ VDUSE - на данный момент лучший интерфейс для п
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m). VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
В ядрах в Debian Linux эти опции включены, только начиная с 6.6, так что установите свежее ядро В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
из bookworm-backports, trixie или из более новой версии Debian, если хотите попробовать VDUSE. на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
Либо же вы можете самостоятельно собрать модули для установленного ядра: из Proxmox или соберите модули для ядра Debian вручную:
``` ```
mkdir build mkdir build

View File

@ -567,7 +567,6 @@ class Mon
async apply_pool_pgs(results, up_osds, osd_tree, tree_hash) async apply_pool_pgs(results, up_osds, osd_tree, tree_hash)
{ {
const etcd_request = { compare: [], success: [] };
for (const pool_id in (this.state.pg.config||{}).items||{}) for (const pool_id in (this.state.pg.config||{}).items||{})
{ {
// We should stop all PGs when deleting a pool or changing its PG count // We should stop all PGs when deleting a pool or changing its PG count
@ -580,24 +579,9 @@ class Mon
return false; return false;
} }
} }
if (!this.state.config.pools[pool_id])
{
// Delete PG history and stats of the deleted pool
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pg/history/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pg/history/'+pool_id+'0'),
} });
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'0'),
} });
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'0'),
} });
}
} }
const new_pg_config = JSON.parse(JSON.stringify(this.state.pg.config)); const new_pg_config = JSON.parse(JSON.stringify(this.state.pg.config));
const etcd_request = { compare: [], success: [] };
for (const pool_id in (new_pg_config||{}).items||{}) for (const pool_id in (new_pg_config||{}).items||{})
{ {
if (!this.state.config.pools[pool_id]) if (!this.state.config.pools[pool_id])

View File

@ -1,6 +1,6 @@
{ {
"name": "vitastor-mon", "name": "vitastor-mon",
"version": "1.9.3", "version": "1.9.1",
"description": "Vitastor SDS monitor service", "description": "Vitastor SDS monitor service",
"main": "mon-main.js", "main": "mon-main.js",
"scripts": { "scripts": {

View File

@ -3,9 +3,7 @@
set -e set -e
reapply_patch() { reapply_patch() {
if ! [[ -e $1 ]]; then if ! patch -f --dry-run -F 0 -R $1 < $2 >/dev/null; then
echo "$1 does not exist, OpenNebula is not installed"
elif ! patch -f --dry-run -F 0 -R $1 < $2 >/dev/null; then
already_applied=0 already_applied=0
if ! patch --no-backup-if-mismatch -r - -F 0 -f $1 < $2; then if ! patch --no-backup-if-mismatch -r - -F 0 -f $1 < $2; then
applied_ok=0 applied_ok=0
@ -17,13 +15,8 @@ echo "Reapplying Vitastor patches to OpenNebula's oned.conf, vmm_execrc and down
already_applied=1 already_applied=1
applied_ok=1 applied_ok=1
reapply_patch /var/lib/one/remotes/datastore/downloader.sh /var/lib/one/remotes/datastore/vitastor/downloader-vitastor.sh.diff reapply_patch /var/lib/one/remotes/datastore/downloader.sh /var/lib/one/remotes/datastore/vitastor/downloader-vitastor.sh.diff
reapply_patch /etc/one/oned.conf /var/lib/one/remotes/datastore/vitastor/oned.conf.diff
reapply_patch /etc/one/vmm_exec/vmm_execrc /var/lib/one/remotes/datastore/vitastor/vmm_execrc.diff reapply_patch /etc/one/vmm_exec/vmm_execrc /var/lib/one/remotes/datastore/vitastor/vmm_execrc.diff
if [[ -e /etc/one/oned.conf ]]; then
if ! /var/lib/one/remotes/datastore/vitastor/patch-oned-conf.py /etc/one/oned.conf; then
applied_ok=0
already_applied=0
fi
fi
if [[ "$already_applied" = 1 ]]; then if [[ "$already_applied" = 1 ]]; then
echo "OK: Vitastor OpenNebula patches are already applied" echo "OK: Vitastor OpenNebula patches are already applied"
elif [[ "$applied_ok" = 1 ]]; then elif [[ "$applied_ok" = 1 ]]; then

View File

@ -1,115 +0,0 @@
#!/usr/bin/env python3
# Patch /etc/one/oned.conf for Vitastor support
# -s = also enable save.vitastor/restore.vitastor overrides
import re
import os
import sys
class Fixer:
    """Patches the text of OpenNebula's oned.conf to enable the Vitastor drivers.

    Usage: create an instance, optionally set ``save_restore = 1``, then call
    ``fix(text)``. The patched text is returned and every problem that prevented
    a part of the patch from being applied is collected in ``self.errors``.
    """

    # When non-zero, save= and restore= overrides are added in addition to deploy=
    save_restore = 0

    def require_sub_cb(self, m, cb):
        """Mark that a match was seen and delegate to the real callback."""
        self.found = 1
        return cb(m)

    def require_sub(self, regexp, cb, text, error):
        """Like re.sub(regexp, cb, text), but record `error` if nothing matched.

        The match counter is local to this call: require_sub() is invoked
        re-entrantly from inside its own substitution callbacks (for example
        fix() -> fix_tm_mad() -> fix_tm_mad_args()), so a flag shared through
        `self` would be overwritten by the inner call and make the outer call
        report a bogus "not found" error.

        An empty `error` means "the match is optional": nothing is recorded.
        """
        hits = 0

        def on_match(m):
            nonlocal hits
            hits += 1
            return self.require_sub_cb(m, cb)

        new_text = re.sub(regexp, on_match, text)
        # Keep the legacy attribute in sync for anything that still reads it
        self.found = 1 if hits else 0
        if not hits and error:
            self.errors.append(error)
        return new_text

    def fix(self, oned_conf):
        """Return `oned_conf` with all Vitastor-related changes applied.

        Resets and fills `self.errors`; an empty list means everything was
        either patched or already in place.
        """
        self.errors = []
        self.kvm_found = 0
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*VM_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_vm_mad(m.group(2))+']', oned_conf, 'VM_MAD not found')
        if not self.kvm_found:
            self.errors.append("VM_MAD[NAME=kvm].ARGUMENTS not found")
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*TM_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_tm_mad(m.group(2))+']', oned_conf, 'TM_MAD not found')
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*DATASTORE_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_datastore_mad(m.group(2))+']', oned_conf, 'DATASTORE_MAD not found')
        # Make sure appended sections start on their own line
        if oned_conf[-1:] != '\n':
            oned_conf += '\n'
        # Each of the following sections is appended only if it is not present yet
        if not re.compile(r'(^|\n)[ \t]*INHERIT_DATASTORE_ATTR\s*=\s*"VITASTOR_CONF"').search(oned_conf):
            oned_conf += '\nINHERIT_DATASTORE_ATTR="VITASTOR_CONF"\n'
        if not re.compile(r'(^|\n)[ \t]*INHERIT_DATASTORE_ATTR\s*=\s*"IMAGE_PREFIX"').search(oned_conf):
            oned_conf += '\nINHERIT_DATASTORE_ATTR="IMAGE_PREFIX"\n'
        if not re.compile(r'(^|\n)[ \t]*TM_MAD_CONF\s*=\s*\[[^\]]*NAME\s*=\s*"vitastor"').search(oned_conf):
            oned_conf += ('\nTM_MAD_CONF = [\n'+
                '    NAME = "vitastor", LN_TARGET = "NONE", CLONE_TARGET = "SELF", SHARED = "YES",\n'+
                '    DS_MIGRATE = "NO", DRIVER = "raw", ALLOW_ORPHANS="format",\n'+
                '    TM_MAD_SYSTEM = "ssh,shared", LN_TARGET_SSH = "SYSTEM", CLONE_TARGET_SSH = "SYSTEM",\n'+
                '    DISK_TYPE_SSH = "FILE", LN_TARGET_SHARED = "NONE",\n'+
                '    CLONE_TARGET_SHARED = "SELF", DISK_TYPE_SHARED = "FILE"\n'+
                ']\n')
        if not re.compile(r'(^|\n)[ \t]*DS_MAD_CONF\s*=\s*\[[^\]]*NAME\s*=\s*"vitastor"').search(oned_conf):
            oned_conf += ('\nDS_MAD_CONF = [\n'+
                '    NAME = "vitastor",\n'+
                '    REQUIRED_ATTRS = "DISK_TYPE,BRIDGE_LIST",\n'+
                '    PERSISTENT_ONLY = "NO",\n'+
                '    MARKETPLACE_ACTIONS = "export"\n'+
                ']\n')
        return oned_conf

    def fix_vm_mad(self, vm_mad_params):
        """Patch the body of one VM_MAD section; only the NAME="kvm" one is touched."""
        if re.compile(r'\bNAME\s*=\s*"kvm"').search(vm_mad_params):
            vm_mad_params = re.sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_vm_mad_args(m.group(2))+'"', vm_mad_params)
            self.kvm_found = 1
        return vm_mad_params

    def fix_vm_mad_args(self, args):
        """Add the Vitastor action overrides to the kvm driver's ARGUMENTS string."""
        args = self.fix_vm_mad_override(args, 'deploy')
        if self.save_restore:
            args = self.fix_vm_mad_override(args, 'save')
            args = self.fix_vm_mad_override(args, 'restore')
        return args

    def fix_vm_mad_override(self, args, override):
        """Ensure `-l` contains `<override>=<override>.vitastor`.

        Records an error (and leaves `args` untouched) if the action is already
        overridden with something else.
        """
        wanted = override+'='+override+'.vitastor'
        m = re.compile(r'-l (\S+)').search(args)
        # Literal substring checks: the '.' in the wanted value must not act as
        # a regex wildcard
        if m and wanted in m.group(1):
            return args
        elif m and (override+'=') in m.group(1):
            self.errors.append(override+"= is already overridden in -l option in VM_MAD[NAME=kvm].ARGUMENTS")
            return args
        elif m:
            return self.require_sub(r'-l (\S+)', lambda m: '-l '+m.group(1)+','+wanted, args, '-l option not found in VM_MAD[NAME=kvm].ARGUMENTS')
        else:
            # No -l option at all: add one
            return args+' -l '+wanted

    def fix_tm_mad(self, params):
        """Add `vitastor` to the -d list in the ARGUMENTS of a TM_MAD section."""
        return self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('d', m.group(2), "TM_MAD")+'"', params, "TM_MAD.ARGUMENTS not found")

    def fix_tm_mad_args(self, opt, args, v):
        """Add `vitastor` to the comma-separated value of option `-<opt>` in `args`.

        `v` is the section name used in the error message.
        """
        return self.require_sub('(-'+opt+r') (\S+)', lambda m: self.fix_tm_mad_arg(m), args, "-"+opt+" option not found in "+v+".ARGUMENTS")

    def fix_tm_mad_arg(self, m):
        """Rebuild `-x a,b,c` with `vitastor` appended unless already listed."""
        a = m.group(2).split(',')
        if 'vitastor' not in a:
            a += [ 'vitastor' ]
        return m.group(1)+' '+(','.join(a))

    def fix_datastore_mad(self, params):
        """Add `vitastor` to both the -d and -s lists of a DATASTORE_MAD section."""
        params = self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('d', m.group(2), "DATASTORE_MAD")+'"', params, "DATASTORE_MAD.ARGUMENTS not found")
        # Second pass handles -s; a missing ARGUMENTS was already reported above,
        # hence the empty error text
        return self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('s', m.group(2), "DATASTORE_MAD")+'"', params, "")
# Command-line entry point: parse arguments, patch the given oned.conf and
# report everything that could not be patched automatically.
import shutil

fixer = Fixer()
oned_conf_file = ''
for arg in sys.argv[1:]:
    if arg == '-s':
        fixer.save_restore = 1
    else:
        # The first non-option argument is the config path; anything after it is ignored
        oned_conf_file = arg
        break
if not oned_conf_file:
    sys.stderr.write("USAGE: ./patch-oned-conf.py [-s] /etc/one/oned.conf\n-s means also enable save.vitastor/restore.vitastor overrides\n")
    sys.exit(1)
with open(oned_conf_file, 'r') as fd:
    oned_conf = fd.read()
new_conf = fixer.fix(oned_conf)
if new_conf != oned_conf:
    # Back up by copying and then rewrite the original in place. Renaming the
    # original away and creating a new file would give the config a fresh inode
    # and lose the mode and ownership the original file had.
    shutil.copy2(oned_conf_file, oned_conf_file+'.bak')
    with open(oned_conf_file, 'w') as fd:
        fd.write(new_conf)
if len(fixer.errors) > 0:
    # Partial patches are still written above; a non-zero exit tells the caller
    # that manual follow-up is needed
    sys.stderr.write("ERROR: Failed to patch "+oned_conf_file+", patch it manually. Errors:\n- "+('\n- '.join(fixer.errors))+'\n')
    sys.exit(1)

View File

@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver from cinder.volume import driver
from cinder.volume import volume_utils from cinder.volume import volume_utils
VITASTOR_VERSION = '1.9.3' VITASTOR_VERSION = '1.9.1'
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

@ -306,12 +306,12 @@ index e5ff653a60..884ecc79ea 100644
+ etcd = virBufferContentAndReset(&buf); + etcd = virBufferContentAndReset(&buf);
+ } + }
+ +
+ if (virJSONValueObjectAdd(&ret, + if (virJSONValueObjectCreate(&ret,
+ "S:etcd-host", etcd, + "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query, + "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile, + "S:config-path", src->configFile,
+ "s:image", src->path, + "s:image", src->path,
+ NULL) < 0) + NULL) < 0)
+ return NULL; + return NULL;
+ +
+ return ret; + return ret;

View File

@ -1,172 +0,0 @@
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index fbda17c987..3edac22aff 100644
--- a/meson.build
+++ b/meson.build
@@ -1510,6 +1510,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2351,6 +2371,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4510,6 +4531,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0269fa0f16..4740ffdc27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index aa40d44f1d..bbee6a0e9c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4286,6 +4286,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4742,6 +4764,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5183,6 +5206,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5405,6 +5442,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index c97079a38c..4623f552ec 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -444,6 +445,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el7.tar.gz Source0: vitastor-1.9.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el8.tar.gz Source0: vitastor-1.9.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el9.tar.gz Source0: vitastor-1.9.1.el9.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@ -74,7 +74,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers Summary: Vitastor - fio drivers
Group: Development/Libraries Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.35-1.el9 Requires: fio = 3.27-8.el9
%description -n vitastor-fio %description -n vitastor-fio

View File

@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif() endif()
add_definitions(-DVITASTOR_VERSION="1.9.3") add_definitions(-DVITASTOR_VERSION="1.9.1")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer) add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN}) if (${WITH_ASAN})

View File

@ -10,7 +10,7 @@ endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC add_library(vitastor_common STATIC
../util/epoll_manager.cpp etcd_state_client.cpp messenger.cpp ../util/addr_util.cpp ../util/epoll_manager.cpp etcd_state_client.cpp messenger.cpp ../util/addr_util.cpp
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ${MSGR_RDMA} http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ${MSGR_RDMA}
) )
target_link_libraries(vitastor_common pthread) target_link_libraries(vitastor_common pthread)
target_compile_options(vitastor_common PUBLIC -fPIC) target_compile_options(vitastor_common PUBLIC -fPIC)
@ -88,7 +88,7 @@ add_executable(test_cluster_client
EXCLUDE_FROM_ALL EXCLUDE_FROM_ALL
../test/test_cluster_client.cpp ../test/test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp ../test/mock/messenger.cpp msgr_stop.cpp pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp ../test/mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../../json11/json11.cpp
) )
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__) target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client BEFORE PUBLIC ${CMAKE_SOURCE_DIR}/src/test/mock) target_include_directories(test_cluster_client BEFORE PUBLIC ${CMAKE_SOURCE_DIR}/src/test/mock)

View File

@ -4,7 +4,7 @@
#include <stdexcept> #include <stdexcept>
#include <assert.h> #include <assert.h>
#include "cluster_client_impl.h" #include "cluster_client_impl.h"
#include "json_util.h" #include "http_client.h" // json_is_true
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config) cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
{ {
@ -955,7 +955,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
? (stripe + pg_block_size) : (op->offset + op->len); ? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i].iov.reset(); op->parts[i].iov.reset();
op->parts[i].flags = 0; op->parts[i].flags = 0;
if (op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied) if (op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
{ {
// Read remaining parts from upper layers // Read remaining parts from upper layers
uint64_t prev = begin, cur = begin; uint64_t prev = begin, cur = begin;

View File

@ -15,7 +15,6 @@
#include "addr_util.h" #include "addr_util.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
#include "http_client.h" #include "http_client.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
@ -62,7 +61,6 @@ struct http_co_t
inline void end() { ended = true; if (!onstack) { delete this; } } inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear(); void run_cb_and_clear();
void start_connection(); void start_connection();
void start_ws_connection();
void close_connection(); void close_connection();
void next_request(); void next_request();
void handle_events(); void handle_events();
@ -113,7 +111,7 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
handler->keepalive = false; handler->keepalive = false;
handler->request = request; handler->request = request;
handler->response_callback = response_callback; handler->response_callback = response_callback;
handler->start_ws_connection(); handler->start_connection();
return handler; return handler;
} }
@ -283,27 +281,6 @@ void http_co_t::close_connection()
epoll_events = 0; epoll_events = 0;
} }
// Open the connection for a websocket handler and, when a request timeout is
// configured, arm a timer that fails the handler if the websocket state has
// not been reached by the time the timer fires.
void http_co_t::start_ws_connection()
{
    stackin();
    start_connection();
    if (!(request_timeout > 0))
    {
        // No timeout configured, nothing to arm
        stackout();
        return;
    }
    auto on_timeout = [this](int timer_id)
    {
        stackin();
        const bool upgraded = (state == HTTP_CO_WEBSOCKET);
        if (!upgraded)
        {
            // Still not a websocket: drop the connection and report the error
            close_connection();
            parsed = { .error = "Websocket connection timed out" };
            run_cb_and_clear();
        }
        stackout();
    };
    timeout_id = tfd->set_timer(request_timeout, false, on_timeout);
    stackout();
}
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
stackin(); stackin();
@ -747,3 +724,22 @@ static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
buf = buf.substr(hdr+len); buf = buf.substr(hdr+len);
return true; return true;
} }
// FIXME: move to utils
// Truthiness of a JSON value: strings count as true only when they are
// exactly "true", "yes" or "1"; any other value yields its bool_value().
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
    {
        return val.bool_value();
    }
    return val == "true" || val == "yes" || val == "1";
}
// Explicit falseness of a JSON value: the strings "false", "no" and "0",
// the number 0 and the boolean false. Everything else (including null)
// is NOT considered false.
bool json_is_false(const json11::Json & val)
{
    if (val.is_string())
    {
        const std::string & text = val.string_value();
        return text == "false" || text == "no" || text == "0";
    }
    if (val.is_number())
    {
        return val.number_value() == 0;
    }
    return val.is_bool() && !val.bool_value();
}

View File

@ -48,3 +48,9 @@ void http_request(http_co_t *handler, const std::string & host, const std::strin
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback); const http_options_t & options, std::function<void(const http_response_t *response)> response_callback);
void http_post_message(http_co_t *handler, int type, const std::string & msg); void http_post_message(http_co_t *handler, int type, const std::string & msg);
void http_close(http_co_t *co); void http_close(http_co_t *co);
// Utils
std::string strtolower(const std::string & in);
// FIXME: move to json11
bool json_is_true(const json11::Json & val);
bool json_is_false(const json11::Json & val);

View File

@ -177,7 +177,7 @@ protected:
std::vector<int> read_ready_clients; std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients; std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :) // We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<osd_op_t*> set_immediate_ops; std::vector<std::function<void()>> set_immediate;
public: public:
timerfd_manager_t *tfd; timerfd_manager_t *tfd;
@ -237,8 +237,6 @@ protected:
void handle_op_hdr(osd_client_t *cl); void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl); bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op); void handle_reply_ready(osd_op_t *op);
void handle_immediate_ops();
void clear_immediate_ops(int peer_fd);
#ifdef WITH_RDMA #ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl); void try_send_rdma(osd_client_t *cl);

View File

@ -598,7 +598,6 @@ void osd_messenger_t::handle_rdma_events()
} }
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status)); fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id); stop_client(client_id);
clear_immediate_ops(client_id);
continue; continue;
} }
if (!is_send) if (!is_send)
@ -607,7 +606,6 @@ void osd_messenger_t::handle_rdma_events()
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len)) if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{ {
// handle_read_buffer may stop the client // handle_read_buffer may stop the client
clear_immediate_ops(client_id);
continue; continue;
} }
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]); try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
@ -668,5 +666,9 @@ void osd_messenger_t::handle_rdma_events()
} }
} }
} while (event_count > 0); } while (event_count > 0);
handle_immediate_ops(); for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
} }

View File

@ -65,7 +65,6 @@ void osd_messenger_t::read_requests()
bool osd_messenger_t::handle_read(int result, osd_client_t *cl) bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
bool ret = false; bool ret = false;
int peer_fd = cl->peer_fd;
cl->read_msg.msg_iovlen = 0; cl->read_msg.msg_iovlen = 0;
cl->refs--; cl->refs--;
if (cl->peer_state == PEER_STOPPED) if (cl->peer_state == PEER_STOPPED)
@ -102,8 +101,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_read_buffer(cl, cl->in_buf, result)) if (!handle_read_buffer(cl, cl->in_buf, result))
{ {
clear_immediate_ops(peer_fd); goto fin;
return false;
} }
} }
else else
@ -115,8 +113,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_finished_read(cl)) if (!handle_finished_read(cl))
{ {
clear_immediate_ops(peer_fd); goto fin;
return false;
} }
} }
} }
@ -125,47 +122,15 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
ret = true; ret = true;
} }
} }
handle_immediate_ops(); fin:
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
return ret; return ret;
} }
void osd_messenger_t::clear_immediate_ops(int peer_fd)
{
size_t i = 0, j = 0;
while (i < set_immediate_ops.size())
{
if (set_immediate_ops[i]->peer_fd == peer_fd)
{
delete set_immediate_ops[i];
}
else
{
if (i != j)
set_immediate_ops[j] = set_immediate_ops[i];
j++;
}
i++;
}
set_immediate_ops.resize(j);
}
void osd_messenger_t::handle_immediate_ops()
{
for (auto op: set_immediate_ops)
{
if (op->op_type == OSD_OP_IN)
{
exec_op(op);
}
else
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}
set_immediate_ops.clear();
}
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain) bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
{ {
// Compose operation(s) from the buffer // Compose operation(s) from the buffer
@ -234,7 +199,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cl->read_op); cl->received_ops.push_back(cl->read_op);
set_immediate_ops.push_back(cl->read_op); set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -330,7 +295,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cur_op); cl->received_ops.push_back(cur_op);
set_immediate_ops.push_back(cur_op); set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -451,5 +416,9 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
); );
set_immediate_ops.push_back(op); set_immediate.push_back([op]()
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
} }

View File

@ -16,6 +16,7 @@
#include "qapi/error.h" #include "qapi/error.h"
#include "qapi/qmp/qdict.h" #include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h" #include "qapi/qmp/qerror.h"
#include "qemu/uri.h"
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "qemu/module.h" #include "qemu/module.h"
#include "qemu/option.h" #include "qemu/option.h"
@ -1020,11 +1021,7 @@ static BlockDriver bdrv_vitastor = {
// FIXME: Implement it along with per-inode statistics // FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size, //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
#if QEMU_VERSION_MAJOR > 9 || QEMU_VERSION_MAJOR == 9 && QEMU_VERSION_MINOR > 0
.bdrv_open = vitastor_file_open,
#else
.bdrv_file_open = vitastor_file_open, .bdrv_file_open = vitastor_file_open,
#endif
.bdrv_close = vitastor_close, .bdrv_close = vitastor_close,
// Option list for the create operation // Option list for the create operation

View File

@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor Name: Vitastor
Description: Vitastor client library Description: Vitastor client library
Version: 1.9.3 Version: 1.9.1
Libs: -L${libdir} -lvitastor_client Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -369,7 +369,6 @@ struct cli_dd_t
{ {
cli_tool_t *parent; cli_tool_t *parent;
std::vector<std::string> conv, iflag, oflag;
dd_in_info_t iinfo; dd_in_info_t iinfo;
dd_out_info_t oinfo; dd_out_info_t oinfo;
@ -431,7 +430,7 @@ struct cli_dd_t
if (read_op->retval < 0) if (read_op->retval < 0)
{ {
fprintf( fprintf(
stderr, "Failed to read bitmap for %ju bytes from image %s at offset %ju: %s (code %d)\n", stderr, "Failed to read bitmap for %lu bytes from image %s at offset %lu: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset, read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
); );
@ -476,7 +475,7 @@ struct cli_dd_t
if (read_op->retval != read_op->len) if (read_op->retval != read_op->len)
{ {
fprintf( fprintf(
stderr, "Failed to read %ju bytes from image %s at offset %ju: %s (code %d)\n", stderr, "Failed to read %lu bytes from image %s at offset %lu: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset, read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
); );
@ -547,7 +546,7 @@ struct cli_dd_t
if (data->res < 0) if (data->res < 0)
{ {
fprintf( fprintf(
stderr, "Failed to read %ju bytes from %s at offset %ju: %s (code %d)\n", stderr, "Failed to read %lu bytes from %s at offset %lu: %s (code %d)\n",
data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset, data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset,
strerror(-data->res), data->res strerror(-data->res), data->res
); );
@ -644,7 +643,7 @@ struct cli_dd_t
if (write_op->retval != write_op->len) if (write_op->retval != write_op->len)
{ {
fprintf( fprintf(
stderr, "Failed to write %ju bytes to image %s at offset %ju: %s (code %d)\n", stderr, "Failed to write %lu bytes to image %s at offset %lu: %s (code %d)\n",
write_op->len, oinfo.oimg.c_str(), write_op->offset, write_op->len, oinfo.oimg.c_str(), write_op->offset,
strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval
); );
@ -680,7 +679,7 @@ struct cli_dd_t
if (data->res < 0) if (data->res < 0)
{ {
fprintf( fprintf(
stderr, "Failed to write %ju bytes to %s at offset %ju: %s (code %d)\n", stderr, "Failed to write %lu bytes to %s at offset %lu: %s (code %d)\n",
data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(), data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(),
oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0, oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0,
strerror(-data->res), data->res strerror(-data->res), data->res
@ -727,7 +726,7 @@ struct cli_dd_t
{ {
char buf[256]; char buf[256];
snprintf( snprintf(
buf, sizeof(buf), "%ju bytes (%s) copied, %.1f s, %sB/s", buf, sizeof(buf), "%lu bytes (%s) copied, %.1f s, %sB/s",
written_size, format_size(written_size).c_str(), sec_total, written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(written_size/sec_total), true).c_str() format_size((uint64_t)(written_size/sec_total), true).c_str()
); );
@ -749,7 +748,7 @@ struct cli_dd_t
else else
{ {
fprintf( fprintf(
stderr, "\r%ju bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K", stderr, "\r%lu bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
written_size, format_size(written_size).c_str(), sec_total, written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(delta/sec_delta), true).c_str(), format_size((uint64_t)(delta/sec_delta), true).c_str(),
format_size((uint64_t)(written_size/sec_total), true).c_str() format_size((uint64_t)(written_size/sec_total), true).c_str()
@ -767,49 +766,6 @@ struct cli_dd_t
goto resume_3; goto resume_3;
else if (state == 4) else if (state == 4)
goto resume_4; goto resume_4;
for (int i = 0; i < conv.size(); i++)
{
if (conv[i] == "nofsync")
oinfo.end_fsync = false;
else if (conv[i] == "trunc")
oinfo.out_trunc = true;
else if (conv[i] == "nocreat")
oinfo.out_create = false;
else if (conv[i] == "noerror")
ignore_errors = true;
else if (conv[i] == "nosparse")
write_zero = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option conv="+conv[i] };
state = 100;
return;
}
}
for (int i = 0; i < iflag.size(); i++)
{
if (iflag[i] == "direct")
iinfo.in_direct = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option iflag="+iflag[i] };
state = 100;
return;
}
}
for (int i = 0; i < oflag.size(); i++)
{
if (oflag[i] == "direct")
oinfo.out_direct = true;
else if (oflag[i] == "append")
oinfo.out_append = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option oflag="+oflag[i] };
state = 100;
return;
}
}
if ((oinfo.oimg != "" && oinfo.ofile != "") || (iinfo.iimg != "" && iinfo.ifile != "")) if ((oinfo.oimg != "" && oinfo.ofile != "") || (iinfo.iimg != "" && iinfo.ifile != ""))
{ {
result = (cli_result_t){ .err = EINVAL, .text = "Image and file can't be specified at the same time" }; result = (cli_result_t){ .err = EINVAL, .text = "Image and file can't be specified at the same time" };
@ -952,18 +908,6 @@ static uint64_t parse_blocks(json11::Json v, uint64_t bs, uint64_t def)
return res; return res;
} }
static std::vector<std::string> explode_json(const std::string & sep, json11::Json opt)
{
if (opt.is_array())
{
std::vector<std::string> arr;
for (auto & item: opt.array_items())
arr.push_back(item.as_string());
return arr;
}
return explode(sep, opt.as_string(), true);
}
std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg) std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
{ {
auto dd = new cli_dd_t(); auto dd = new cli_dd_t();
@ -979,7 +923,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
dd->oseek = parse_blocks(cfg["oseek"], dd->blocksize, 0); dd->oseek = parse_blocks(cfg["oseek"], dd->blocksize, 0);
if (!dd->oseek) if (!dd->oseek)
dd->oseek = parse_blocks(cfg["seek"], dd->blocksize, 0); dd->oseek = parse_blocks(cfg["seek"], dd->blocksize, 0);
dd->iseek = parse_blocks(cfg["iseek"], dd->blocksize, 0); dd->iseek = parse_blocks(cfg["oseek"], dd->blocksize, 0);
if (!dd->iseek) if (!dd->iseek)
dd->iseek = parse_blocks(cfg["skip"], dd->blocksize, 0); dd->iseek = parse_blocks(cfg["skip"], dd->blocksize, 0);
dd->iodepth = cfg["iodepth"].uint64_value(); dd->iodepth = cfg["iodepth"].uint64_value();
@ -991,9 +935,25 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
progress = true; progress = true;
dd->iinfo.detect_size = cfg["size"].is_null(); dd->iinfo.detect_size = cfg["size"].is_null();
dd->oinfo.out_size = parse_size(cfg["size"].as_string()); dd->oinfo.out_size = parse_size(cfg["size"].as_string());
dd->conv = explode_json(",", cfg["conv"]); std::vector<std::string> conv = explode(",", cfg["conv"].string_value(), true);
dd->iflag = explode_json(",", cfg["iflag"]); if (std::find(conv.begin(), conv.end(), "nofsync") != conv.end())
dd->oflag = explode_json(",", cfg["oflag"]); dd->oinfo.end_fsync = false;
if (std::find(conv.begin(), conv.end(), "trunc") != conv.end())
dd->oinfo.out_trunc = true;
if (std::find(conv.begin(), conv.end(), "nocreat") != conv.end())
dd->oinfo.out_create = false;
if (std::find(conv.begin(), conv.end(), "noerror") != conv.end())
dd->ignore_errors = true;
if (std::find(conv.begin(), conv.end(), "nosparse") != conv.end())
dd->write_zero = true;
conv = explode(",", cfg["iflag"].string_value(), true);
if (std::find(conv.begin(), conv.end(), "direct") != conv.end())
dd->iinfo.in_direct = true;
conv = explode(",", cfg["oflag"].string_value(), true);
if (std::find(conv.begin(), conv.end(), "direct") != conv.end())
dd->oinfo.out_direct = true;
if (std::find(conv.begin(), conv.end(), "append") != conv.end())
dd->oinfo.out_append = true;
return [dd](cli_result_t & result) return [dd](cli_result_t & result)
{ {
dd->loop(); dd->loop();

View File

@ -5,7 +5,6 @@
#include "cluster_client.h" #include "cluster_client.h"
#include "pg_states.h" #include "pg_states.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
struct cli_fix_t struct cli_fix_t
{ {

View File

@ -21,3 +21,6 @@ template<class T> void remove_duplicates(std::vector<T> & ret)
} }
ret.resize(j+1); ret.resize(j+1);
} }
// from http_client.cpp...
bool json_is_false(const json11::Json & val);

View File

@ -4,7 +4,6 @@
#include "cli.h" #include "cli.h"
#include "cluster_client.h" #include "cluster_client.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
#include "http_client.h" #include "http_client.h"
// Reweight OSD, change tags or set noout flag // Reweight OSD, change tags or set noout flag

View File

@ -156,8 +156,6 @@ resume_1:
for (auto & jtag: osd_cfg["tags"].array_items()) for (auto & jtag: osd_cfg["tags"].array_items())
osd.tags.push_back(jtag.string_value()); osd.tags.push_back(jtag.string_value());
} }
else if (osd_cfg["tags"].is_string())
osd.tags.push_back(osd_cfg["tags"].string_value());
osd.noout = osd_cfg["noout"].bool_value(); osd.noout = osd_cfg["noout"].bool_value();
} }
auto np_it = node_placement.find(std::to_string(osd.num)); auto np_it = node_placement.find(std::to_string(osd.num));

View File

@ -4,7 +4,6 @@
#include "cli.h" #include "cli.h"
#include "cluster_client.h" #include "cluster_client.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
#include "pg_states.h" #include "pg_states.h"
#include "http_client.h" #include "http_client.h"

View File

@ -5,9 +5,8 @@ project(vitastor)
# vitastor-disk # vitastor-disk
add_executable(vitastor-disk add_executable(vitastor-disk
disk_tool.cpp disk_simple_offsets.cpp disk_tool.cpp disk_simple_offsets.cpp
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp ../util/crc32c.c ../util/str_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
) )
target_link_libraries(vitastor-disk target_link_libraries(vitastor-disk
tcmalloc_minimal tcmalloc_minimal

View File

@ -27,16 +27,12 @@ static const char *help_text =
" --osd_per_disk <N>\n" " --osd_per_disk <N>\n"
" Create <N> OSDs on each disk (default 1)\n" " Create <N> OSDs on each disk (default 1)\n"
" --hybrid\n" " --hybrid\n"
" Prepare hybrid (HDD+SSD, NVMe+SATA or etc) OSDs using provided devices. By default,\n" " Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for\n"
" any passed SSDs will be used for journals and metadata, HDDs will be used for data,\n" " journals and metadata, HDDs will be used for data. Partitions for journals and\n"
" but you can override this behaviour with --fast-devices option. Journal and metadata\n" " metadata will be created automatically. Whether disks are SSD or HDD is decided\n"
" partitions will be created automatically. In the default mode, SSD and HDD disks\n" " by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object\n"
" are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used\n" " size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,\n"
" for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal\n" " and throttle_small_writes is enabled by default.\n"
" size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.\n"
" --fast-devices /dev/nvmeX,/dev/nvmeY\n"
" In --hybrid mode, use these devices for journal and metadata instead of auto-detecting\n"
" and extracting them from the main [devices...] list.\n"
" --disable_data_fsync auto\n" " --disable_data_fsync auto\n"
" Disable data device cache and fsync (1/yes/true = on, default auto)\n" " Disable data device cache and fsync (1/yes/true = on, default auto)\n"
" --disable_meta_fsync auto\n" " --disable_meta_fsync auto\n"
@ -96,22 +92,8 @@ static const char *help_text =
" \n" " \n"
" Requires the `sfdisk` utility.\n" " Requires the `sfdisk` utility.\n"
"\n" "\n"
"vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]\n" "vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]\n"
" Resize data area and/or move journal and metadata:\n" " Resize data area and/or rewrite/move journal and metadata\n"
" --move-journal TARGET move journal to TARGET\n"
" --move-meta TARGET move metadata to TARGET\n"
" --journal-size NEW_SIZE resize journal to NEW_SIZE\n"
" --data-size NEW_SIZE resize data device to NEW_SIZE\n"
" --dry-run only show new layout, do not apply it\n"
" \n"
" NEW_SIZE may include k/m/g/t suffixes.\n"
" TARGET may be one of:\n"
" <partition> move journal/metadata to an existing GPT partition\n"
" <raw_device> create a GPT partition on <raw_device> and move journal/metadata to it\n"
" \"\" (empty string) move journal/metadata back to the data device\n"
"\n"
"vitastor-disk raw-resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]\n"
" Resize data area and/or rewrite/move journal and metadata (manual format).\n"
" ALL_OSD_PARAMETERS must include all (at least all disk-related)\n" " ALL_OSD_PARAMETERS must include all (at least all disk-related)\n"
" parameters from OSD command line (i.e. from systemd unit or superblock).\n" " parameters from OSD command line (i.e. from systemd unit or superblock).\n"
" NEW_LAYOUT may include new disk layout parameters:\n" " NEW_LAYOUT may include new disk layout parameters:\n"
@ -161,10 +143,8 @@ static const char *help_text =
" For now, this only checks that device cache is in write-through mode if fsync is disabled.\n" " For now, this only checks that device cache is in write-through mode if fsync is disabled.\n"
" Intended for use from startup scripts (i.e. from systemd units).\n" " Intended for use from startup scripts (i.e. from systemd units).\n"
"\n" "\n"
"vitastor-disk dump-journal [OPTIONS] <osd_device>\n"
"vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>\n" "vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>\n"
" Dump journal in text or JSON (if --json is specified) format.\n" " Dump journal in human-readable or JSON (if --json is specified) format.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
" Options:\n" " Options:\n"
" --all Scan the whole journal area for entries and dump them, even outdated ones\n" " --all Scan the whole journal area for entries and dump them, even outdated ones\n"
" --json Dump journal in JSON format\n" " --json Dump journal in JSON format\n"
@ -172,21 +152,16 @@ static const char *help_text =
" --format data Same as \"entries\", but also include small write data\n" " --format data Same as \"entries\", but also include small write data\n"
" --format blocks Dump as an array of journal blocks each containing array of entries\n" " --format blocks Dump as an array of journal blocks each containing array of entries\n"
"\n" "\n"
"vitastor-disk write-journal <osd_device>\n"
"vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>\n" "vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>\n"
" Write journal from JSON taken from standard input in the same format as produced by\n" " Write journal from JSON taken from standard input in the same format as produced by\n"
" `dump-journal --json --format data`.\n" " `dump-journal --json --format data`.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n" "\n"
"vitastor-disk dump-meta <osd_device>\n"
"vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>\n" "vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>\n"
" Dump metadata in JSON format.\n" " Dump metadata in JSON format.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n" "\n"
"vitastor-disk write-meta <osd_device>\n"
"vitastor-disk write-meta <meta_file> <offset> <size>\n" "vitastor-disk write-meta <meta_file> <offset> <size>\n"
" Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.\n" " Write metadata from JSON taken from standard input in the same format as produced by\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n" " `dump-meta`. Intended for debugging.\n"
"\n" "\n"
"vitastor-disk simple-offsets <device>\n" "vitastor-disk simple-offsets <device>\n"
" Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n" " Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
@ -200,7 +175,6 @@ static const char *help_text =
" --device_size 0 Set device size\n" " --device_size 0 Set device size\n"
" --format text Result format: json, options, env, or text\n" " --format text Result format: json, options, env, or text\n"
"\n" "\n"
"Default I/O mode for commands involving disk I/O is O_DIRECT. If you don't want it, add --io cached.\n"
"Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n" "Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n"
; ;
@ -225,10 +199,6 @@ int main(int argc, char *argv[])
cmd.push_back((char*)"dump-journal"); cmd.push_back((char*)"dump-journal");
aliased = true; aliased = true;
} }
else if (!strcmp(exe_name, "vitastor-disk-test"))
{
self.test_mode = true;
}
for (int i = 1; i < argc; i++) for (int i = 1; i < argc; i++)
{ {
if (!strcmp(argv[i], "--all")) if (!strcmp(argv[i], "--all"))
@ -259,10 +229,6 @@ int main(int argc, char *argv[])
{ {
self.options["force"] = "1"; self.options["force"] = "1";
} }
else if (!strcmp(argv[i], "--dry-run") || !strcmp(argv[i], "--dry_run"))
{
self.options["dry_run"] = "1";
}
else if (!strcmp(argv[i], "--allow-data-loss")) else if (!strcmp(argv[i], "--allow-data-loss"))
{ {
self.options["allow_data_loss"] = "1"; self.options["allow_data_loss"] = "1";
@ -270,7 +236,7 @@ int main(int argc, char *argv[])
else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1) else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
{ {
char *key = argv[i]+2; char *key = argv[i]+2;
self.options[str_replace(key, "-", "_")] = argv[++i]; self.options[key] = argv[++i];
} }
else else
{ {
@ -283,50 +249,29 @@ int main(int argc, char *argv[])
} }
if (!strcmp(cmd[0], "dump-journal")) if (!strcmp(cmd[0], "dump-journal"))
{ {
if (cmd.size() != 2 && cmd.size() < 5) if (cmd.size() < 5)
{ {
print_help(help_text, aliased ? "vitastor-dump-journal" : "vitastor-disk", cmd[0], false); print_help(help_text, aliased ? "vitastor-dump-journal" : "vitastor-disk", cmd[0], false);
return 1; return 1;
} }
self.dsk.journal_device = cmd[1]; self.dsk.journal_device = cmd[1];
if (cmd.size() > 2) self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
{ self.dsk.journal_offset = strtoull(cmd[3], NULL, 10);
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10); self.dsk.journal_len = strtoull(cmd[4], NULL, 10);
self.dsk.journal_offset = strtoull(cmd[3], NULL, 10);
self.dsk.journal_len = strtoull(cmd[4], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.dsk.journal_device))
return 1;
}
return self.dump_journal(); return self.dump_journal();
} }
else if (!strcmp(cmd[0], "write-journal")) else if (!strcmp(cmd[0], "write-journal"))
{ {
if (cmd.size() != 2 && cmd.size() < 6) if (cmd.size() < 6)
{ {
print_help(help_text, "vitastor-disk", cmd[0], false); print_help(help_text, "vitastor-disk", cmd[0], false);
return 1; return 1;
} }
self.new_journal_device = cmd[1]; self.new_journal_device = cmd[1];
if (cmd.size() > 2) self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
{ self.dsk.clean_entry_bitmap_size = strtoul(cmd[3], NULL, 10);
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10); self.new_journal_offset = strtoull(cmd[4], NULL, 10);
self.dsk.clean_entry_bitmap_size = strtoul(cmd[3], NULL, 10); self.new_journal_len = strtoull(cmd[5], NULL, 10);
self.new_journal_offset = strtoull(cmd[4], NULL, 10);
self.new_journal_len = strtoull(cmd[5], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_journal_device))
return 1;
self.new_journal_device = self.dsk.journal_device;
self.new_journal_offset = self.dsk.journal_offset;
self.new_journal_len = self.dsk.journal_len;
}
std::string json_err; std::string json_err;
json11::Json entries = json11::Json::parse(read_all_fd(0), json_err); json11::Json entries = json11::Json::parse(read_all_fd(0), json_err);
if (json_err != "") if (json_err != "")
@ -351,48 +296,27 @@ int main(int argc, char *argv[])
} }
else if (!strcmp(cmd[0], "dump-meta")) else if (!strcmp(cmd[0], "dump-meta"))
{ {
if (cmd.size() != 2 && cmd.size() < 5) if (cmd.size() < 5)
{ {
print_help(help_text, "vitastor-disk", cmd[0], false); print_help(help_text, "vitastor-disk", cmd[0], false);
return 1; return 1;
} }
self.dsk.meta_device = cmd[1]; self.dsk.meta_device = cmd[1];
if (cmd.size() > 2) self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10);
{ self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10); self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.dsk.meta_device))
return 1;
}
return self.dump_meta(); return self.dump_meta();
} }
else if (!strcmp(cmd[0], "write-meta")) else if (!strcmp(cmd[0], "write-meta"))
{ {
if (cmd.size() != 2 && cmd.size() < 4) if (cmd.size() < 4)
{ {
print_help(help_text, "vitastor-disk", cmd[0], false); print_help(help_text, "vitastor-disk", cmd[0], false);
return 1; return 1;
} }
self.new_meta_device = cmd[1]; self.new_meta_device = cmd[1];
if (cmd.size() > 2) self.new_meta_offset = strtoull(cmd[2], NULL, 10);
{ self.new_meta_len = strtoull(cmd[3], NULL, 10);
self.new_meta_offset = strtoull(cmd[2], NULL, 10);
self.new_meta_len = strtoull(cmd[3], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_meta_device))
return 1;
self.new_meta_device = self.dsk.meta_device;
self.new_meta_offset = self.dsk.meta_offset;
self.new_meta_len = self.dsk.meta_len;
}
std::string json_err; std::string json_err;
json11::Json meta = json11::Json::parse(read_all_fd(0), json_err); json11::Json meta = json11::Json::parse(read_all_fd(0), json_err);
if (json_err != "") if (json_err != "")
@ -404,16 +328,7 @@ int main(int argc, char *argv[])
} }
else if (!strcmp(cmd[0], "resize")) else if (!strcmp(cmd[0], "resize"))
{ {
if (cmd.size() != 2) return self.resize_data();
{
fprintf(stderr, "Exactly 1 OSD number or OSD device path argument is required\n");
return 1;
}
return self.resize_data(cmd[1]);
}
else if (!strcmp(cmd[0], "raw-resize"))
{
return self.raw_resize();
} }
else if (!strcmp(cmd[0], "simple-offsets")) else if (!strcmp(cmd[0], "simple-offsets"))
{ {

View File

@ -22,7 +22,6 @@
#define VITASTOR_DISK_MAX_SB_SIZE 128*1024 #define VITASTOR_DISK_MAX_SB_SIZE 128*1024
#define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903" #define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903"
#define DEFAULT_HYBRID_JOURNAL "1G" #define DEFAULT_HYBRID_JOURNAL "1G"
#define DEFAULT_HYBRID_SSD_JOURNAL "128M"
struct resizer_data_moving_t; struct resizer_data_moving_t;
@ -41,7 +40,6 @@ struct disk_tool_t
/**** Parameters ****/ /**** Parameters ****/
std::map<std::string, std::string> options; std::map<std::string, std::string> options;
bool test_mode = false;
bool all, json, now; bool all, json, now;
bool dump_with_blocks, dump_with_data; bool dump_with_blocks, dump_with_data;
blockstore_disk_t dsk; blockstore_disk_t dsk;
@ -95,16 +93,10 @@ struct disk_tool_t
void dump_meta_header(blockstore_meta_header_v2_t *hdr); void dump_meta_header(blockstore_meta_header_v2_t *hdr);
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap); void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
int dump_load_check_superblock(const std::string & device);
int write_json_journal(json11::Json entries); int write_json_journal(json11::Json entries);
int write_json_meta(json11::Json meta); int write_json_meta(json11::Json meta);
int resize_data(std::string device); int resize_data();
int resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run);
int resize_parse_move_meta(std::map<std::string, std::string> & move_options, bool dry_run);
int raw_resize();
int resize_parse_params(); int resize_parse_params();
void resize_init(blockstore_meta_header_v2_t *hdr); void resize_init(blockstore_meta_header_v2_t *hdr);
int resize_remap_blocks(); int resize_remap_blocks();
@ -122,14 +114,11 @@ struct disk_tool_t
int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices); int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
int pre_exec_osd(std::string device); int pre_exec_osd(std::string device);
int purge_devices(const std::vector<std::string> & devices); int purge_devices(const std::vector<std::string> & devices);
int clear_osd_superblock(const std::string & dev);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false); json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
uint32_t write_osd_superblock(std::string device, json11::Json params); uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1); int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
int check_existing_partition(std::string & dev_by_uuid);
int fix_partition_type(std::string & dev_by_uuid);
int prepare(std::vector<std::string> devices); int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices); std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes); json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
@ -144,13 +133,13 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
uint64_t sscanf_json(const char *fmt, const json11::Json & str); uint64_t sscanf_json(const char *fmt, const json11::Json & str);
void fromhexstr(const std::string & from, int bytes, uint8_t *to); void fromhexstr(const std::string & from, int bytes, uint8_t *to);
int disable_cache(std::string dev); int disable_cache(std::string dev);
uint64_t get_device_size(const std::string & dev, bool should_exist = false);
std::string get_parent_device(std::string dev); std::string get_parent_device(std::string dev);
bool json_is_true(const json11::Json & val);
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err); int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err);
int write_zero(int fd, uint64_t offset, uint64_t size); int write_zero(int fd, uint64_t offset, uint64_t size);
json11::Json read_parttable(std::string dev); json11::Json read_parttable(std::string dev);
uint64_t dev_size_from_parttable(json11::Json pt); uint64_t dev_size_from_parttable(json11::Json pt);
uint64_t free_from_parttable(json11::Json pt); uint64_t free_from_parttable(json11::Json pt);
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid); int fix_partition_type(std::string dev_by_uuid);
std::string csum_type_str(uint32_t data_csum_type); std::string csum_type_str(uint32_t data_csum_type);
uint32_t csum_type_from_str(std::string data_csum_type); uint32_t csum_type_from_str(std::string data_csum_type);

View File

@ -18,7 +18,7 @@ int disk_tool_t::dump_journal()
printf("[\n"); printf("[\n");
if (all) if (all)
{ {
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.journal_fd < 0) if (dsk.journal_fd < 0)
{ {
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@ -121,7 +121,7 @@ int disk_tool_t::dump_journal()
int disk_tool_t::process_journal(std::function<int(void*)> block_fn) int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{ {
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.journal_fd < 0) if (dsk.journal_fd < 0)
{ {
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@ -517,12 +517,6 @@ int disk_tool_t::write_json_journal(json11::Json entries)
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->small_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF); uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->small_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write) + data_csum_size); fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write) + data_csum_size);
fromhexstr(rec["data"].string_value(), ne->small_write.len, new_journal_data); fromhexstr(rec["data"].string_value(), ne->small_write.len, new_journal_data);
if (ne->small_write.len > 0 && !rec["data"].is_string())
{
fprintf(stderr, "Error: entry data is missing, please generate the dump with --json --format data\n");
free(new_journal_buf);
return 1;
}
if (dsk.data_csum_type) if (dsk.data_csum_type)
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write)); fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write));
if (rec["data"].is_string()) if (rec["data"].is_string())

View File

@ -4,7 +4,6 @@
#include "disk_tool.h" #include "disk_tool.h"
#include "rw_blocking.h" #include "rw_blocking.h"
#include "osd_id.h" #include "osd_id.h"
#include "json_util.h"
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn, int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn) std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn)
@ -14,7 +13,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT); fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1; return 1;
} }
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.meta_fd < 0) if (dsk.meta_fd < 0)
{ {
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
@ -150,31 +149,6 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
return 0; return 0;
} }
int disk_tool_t::dump_load_check_superblock(const std::string & device)
{
json11::Json sb = read_osd_superblock(device, true, false);
if (sb.is_null())
return 1;
try
{
auto cfg = json_to_string_map(sb["params"].object_items());
dsk.parse_config(cfg);
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "%s\n", e.what());
return 1;
}
dsk.close_all();
return 0;
}
int disk_tool_t::dump_meta() int disk_tool_t::dump_meta()
{ {
int r = process_meta( int r = process_meta(
@ -202,7 +176,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
{ {
printf( printf(
"{\"version\":\"0.9\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u," "{\"version\":\"0.9\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
"\"data_csum_type\":\"%s\",\"csum_block_size\":%u,\"entries\":[\n", "\"data_csum_type\":%s,\"csum_block_size\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity, hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
); );
@ -269,16 +243,12 @@ int disk_tool_t::write_json_meta(json11::Json meta)
? meta["data_block_size"].uint64_value() : 131072; ? meta["data_block_size"].uint64_value() : 131072;
new_hdr->bitmap_granularity = meta["bitmap_granularity"].uint64_value() new_hdr->bitmap_granularity = meta["bitmap_granularity"].uint64_value()
? meta["bitmap_granularity"].uint64_value() : 4096; ? meta["bitmap_granularity"].uint64_value() : 4096;
if (new_hdr->version >= BLOCKSTORE_META_FORMAT_V2) new_hdr->data_csum_type = meta["data_csum_type"].is_number()
{ ? meta["data_csum_type"].uint64_value()
new_hdr->data_csum_type = meta["data_csum_type"].is_number() : (meta["data_csum_type"].string_value() == "crc32c"
? meta["data_csum_type"].uint64_value() ? BLOCKSTORE_CSUM_CRC32C
: (meta["data_csum_type"].string_value() == "crc32c" : BLOCKSTORE_CSUM_NONE);
? BLOCKSTORE_CSUM_CRC32C new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
: BLOCKSTORE_CSUM_NONE);
new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
}
uint32_t new_clean_entry_header_size = (new_hdr->version == BLOCKSTORE_META_FORMAT_V1 uint32_t new_clean_entry_header_size = (new_hdr->version == BLOCKSTORE_META_FORMAT_V1
? sizeof(clean_disk_entry) : sizeof(clean_disk_entry) + 4 /*entry_csum*/); ? sizeof(clean_disk_entry) : sizeof(clean_disk_entry) + 4 /*entry_csum*/);
new_clean_entry_bitmap_size = (new_hdr->data_block_size / new_hdr->bitmap_granularity + 7) / 8; new_clean_entry_bitmap_size = (new_hdr->data_block_size / new_hdr->bitmap_granularity + 7) / 8;
@ -315,7 +285,8 @@ int disk_tool_t::write_json_meta(json11::Json meta)
fromhexstr(e["data_csum"].string_value(), new_data_csum_size, fromhexstr(e["data_csum"].string_value(), new_data_csum_size,
((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size); ((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size);
} }
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4); uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + sizeof(clean_disk_entry) +
2*new_clean_entry_bitmap_size + new_data_csum_size);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4); *new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
} }
} }

View File

@ -3,7 +3,6 @@
#include "disk_tool.h" #include "disk_tool.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
#include "osd_id.h" #include "osd_id.h"
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd) int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
@ -29,12 +28,18 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
}; };
if (options.find("force") == options.end()) if (options.find("force") == options.end())
{ {
std::string* all_devs[] = { &options["data_device"], &options["meta_device"], &options["journal_device"] }; std::vector<std::string> all_devs = { options["data_device"], options["meta_device"], options["journal_device"] };
for (int i = 0; i < 3; i++) for (int i = 0; i < all_devs.size(); i++)
{ {
auto & dev = *all_devs[i]; const auto & dev = all_devs[i];
if (dev == "") if (dev == "")
continue; continue;
if (dev.substr(0, 22) != "/dev/disk/by-partuuid/")
{
// Partitions should be identified by GPT partition UUID
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", dev.c_str());
return 1;
}
std::string real_dev = realpath_str(dev, false); std::string real_dev = realpath_str(dev, false);
if (real_dev == "") if (real_dev == "")
return 1; return 1;
@ -47,9 +52,24 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
return 1; return 1;
} }
if (i == 0 && is_hdd == -1) if (i == 0 && is_hdd == -1)
is_hdd = trim(read_file("/sys/block/"+parent_dev.substr(5)+"/queue/rotational")) == "1"; is_hdd = trim(read_file("/sys/block/"+parent_dev+"/queue/rotational")) == "1";
if (check_existing_partition(dev) != 0) std::string out;
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
{
fprintf(stderr, "%s contains data, not creating OSD without --force. wipefs shows:\n%s", dev.c_str(), out.c_str());
return 1; return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (fix_partition_type(dev) != 0)
{
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
return 1;
}
} }
} }
for (auto dev: std::vector<std::string>{"data", "meta", "journal"}) for (auto dev: std::vector<std::string>{"data", "meta", "journal"})
@ -108,11 +128,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try try
{ {
dsk.parse_config(options); dsk.parse_config(options);
// Set all offsets to 4096 to calculate metadata size with excess dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -157,11 +173,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
return 1; return 1;
} }
std::string osd_num_str; std::string osd_num_str;
if (test_mode && options.find("osd_num") != options.end()) if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{
osd_num_str = options["osd_num"];
}
else if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{ {
dsk.close_all(); dsk.close_all();
return 1; return 1;
@ -175,8 +187,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
} }
sb["osd_num"] = osd_num; sb["osd_num"] = osd_num;
// Zero out metadata and journal // Zero out metadata and journal
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 || if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0) write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0)
{ {
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno)); fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
dsk.close_all(); dsk.close_all();
@ -201,76 +213,52 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
if (sep_j) if (sep_j)
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]); desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str()); fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
if (!test_mode || options.find("no_init") == options.end()) if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
{ {
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0) fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
{
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
return 1;
}
}
return 0;
}
int disk_tool_t::check_existing_partition(std::string & dev)
{
std::string out;
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
{
fprintf(stderr, "%s contains data, not creating OSD without --force. wipefs shows:\n%s", dev.c_str(), out.c_str());
return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (fix_partition_type(dev) != 0)
{
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
return 1; return 1;
} }
return 0; return 0;
} }
int disk_tool_t::fix_partition_type(std::string & dev)
{
std::string type_uuid = VITASTOR_PART_TYPE;
if (test_mode && options.find("part_type_uuid") != options.end())
{
type_uuid = options["part_type_uuid"];
}
return fix_partition_type_uuid(dev, type_uuid);
}
std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices) std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices)
{ {
std::vector<vitastor_dev_info_t> devinfo; std::vector<vitastor_dev_info_t> devinfo;
std::set<std::string> seen;
for (auto & dev: devices) for (auto & dev: devices)
{ {
if (seen.find(dev) != seen.end())
{
fprintf(stderr, "%s is specified multiple times, ignoring\n", dev.c_str());
continue;
}
// Check if the device is a whole disk // Check if the device is a whole disk
if (dev.substr(0, 5) != "/dev/") if (dev.substr(0, 5) != "/dev/")
{ {
fprintf(stderr, "%s does not start with /dev/, ignoring\n", dev.c_str()); fprintf(stderr, "%s does not start with /dev/, ignoring\n", dev.c_str());
continue; continue;
} }
struct stat sys_st; struct stat dev_st, sys_st;
uint64_t dev_size = get_device_size(dev, false); if (stat(dev.c_str(), &dev_st) < 0)
if (dev_size == UINT64_MAX)
{ {
if (errno == ENOENT)
{
fprintf(stderr, "%s does not exist, skipping\n", dev.c_str());
continue;
}
fprintf(stderr, "Error checking %s: %s\n", dev.c_str(), strerror(errno));
return {}; return {};
} }
else if (!dev_size) uint64_t dev_size = dev_st.st_size;
if (S_ISBLK(dev_st.st_mode))
{ {
fprintf(stderr, "%s does not exist, skipping\n", dev.c_str()); int fd = open(dev.c_str(), O_DIRECT|O_RDWR);
continue; if (fd < 0)
{
fprintf(stderr, "Failed to open %s: %s\n", dev.c_str(), strerror(errno));
return {};
}
if (ioctl(fd, BLKGETSIZE64, &dev_size) < 0)
{
fprintf(stderr, "Failed to get %s size: %s\n", dev.c_str(), strerror(errno));
close(fd);
return {};
}
close(fd);
} }
if (stat(("/sys/block/"+dev.substr(5)).c_str(), &sys_st) < 0) if (stat(("/sys/block/"+dev.substr(5)).c_str(), &sys_st) < 0)
{ {
@ -315,6 +303,10 @@ std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<
.free = !pt.is_null() ? free_from_parttable(pt) : dev_size, .free = !pt.is_null() ? free_from_parttable(pt) : dev_size,
}); });
} }
if (!devinfo.size())
{
fprintf(stderr, "No suitable devices found\n");
}
return devinfo; return devinfo;
} }
@ -345,7 +337,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n"; script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n";
} }
std::string out; std::string out;
if (shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", devinfo.path }, script, &out, NULL) != 0) if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
{ {
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk\n", sizes.size()); fprintf(stderr, "Failed to add %zu partition(s) with sfdisk\n", sizes.size());
return {}; return {};
@ -365,61 +357,68 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size()); fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
return {}; return {};
} }
// Check if new devices exist, run partprobe if not, then wait until they appear // Check if new nodes exist and run partprobe if not
// FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted // FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted
int iter = 0, r;
while (true)
{
for (const auto & part: new_parts)
{
struct stat st;
if (stat(part["node"].string_value().c_str(), &st) < 0)
{
if (errno == ENOENT)
{
iter++;
// Run partprobe
std::string out;
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
{
fprintf(
stderr, iter == 1 && r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
break;
}
else
{
fprintf(stderr, "Failed to lstat %s: %s\n", part["node"].string_value().c_str(), strerror(errno));
return {};
}
}
}
break;
}
// Wait until device symlinks in /dev/disk/by-partuuid/ appear
bool exists = false; bool exists = false;
const int max_iter = 300; // max 30 sec iter = 0;
int iter = 0; while (!exists && iter < 300) // max 30 sec
int r = 0;
while (!exists && iter < max_iter)
{ {
exists = true; exists = true;
for (const auto & part: new_parts) for (const auto & part: new_parts)
{ {
std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value()); std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
struct stat st; struct stat st;
if (stat(part["node"].string_value().c_str(), &st) < 0 || if (lstat(link_path.c_str(), &st) < 0)
lstat(link_path.c_str(), &st) < 0)
{ {
if (errno == ENOENT) if (errno == ENOENT)
{
exists = false; exists = false;
if (iter == 4)
{
// Print message after 400ms
fprintf(stderr, "Waiting for %s to appear for up to %d sec...\n", link_path.c_str(), max_iter/10);
}
}
else else
{ {
fprintf(stderr, "Failed to stat %s or lstat %s: %s\n", part["node"].string_value().c_str(), fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
link_path.c_str(), strerror(errno));
return {}; return {};
} }
} }
} }
if (exists) if (!exists)
{ {
break; struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0);
} }
if (!exists && iter == 0)
{
// Run partprobe
std::string out;
r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL);
if (r != 0)
{
fprintf(
stderr, r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
}
struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0 || !iter);
} }
devinfo.pt = newpt; devinfo.pt = newpt;
devinfo.osd_part_count += sizes.size(); devinfo.osd_part_count += sizes.size();
@ -502,10 +501,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{ {
blockstore_disk_t dsk; blockstore_disk_t dsk;
dsk.parse_config(options); dsk.parse_config(options);
dsk.journal_offset = 4096; dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -515,7 +511,6 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
} }
catch (std::exception & e) catch (std::exception & e)
{ {
dsk.close_all();
fprintf(stderr, "%s\n", e.what()); fprintf(stderr, "%s\n", e.what());
return 1; return 1;
} }
@ -570,12 +565,9 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
{ {
if (options.find("data_device") != options.end() && options["data_device"] != "") if (options.find("data_device") != options.end() && options["data_device"] != "")
{ {
if (options.find("hybrid") != options.end() || if (options.find("hybrid") != options.end() || options.find("osd_per_disk") != options.end() || devices.size())
options.find("fast_devices") != options.end() ||
options.find("osd_per_disk") != options.end() ||
devices.size())
{ {
fprintf(stderr, "Device list (positional arguments), --osd_per_disk, --hybrid and --fast-devices are incompatible with --data_device\n"); fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
return 1; return 1;
} }
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0); return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
@ -592,10 +584,8 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto devinfo = collect_devices(devices); auto devinfo = collect_devices(devices);
if (!devinfo.size()) if (!devinfo.size())
{ {
fprintf(stderr, "No suitable devices found\n");
return 1; return 1;
} }
bool explicit_fast = options.find("fast_devices") != options.end();
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]); uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
if (!osd_per_disk) if (!osd_per_disk)
osd_per_disk = 1; osd_per_disk = 1;
@ -614,55 +604,21 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
if (options.find("disable_meta_fsync") == options.end()) if (options.find("disable_meta_fsync") == options.end())
options["disable_meta_fsync"] = "auto"; options["disable_meta_fsync"] = "auto";
options["disable_journal_fsync"] = options["disable_meta_fsync"]; options["disable_journal_fsync"] = options["disable_meta_fsync"];
if (explicit_fast) for (auto & dev: devinfo)
if (!dev.is_hdd)
ssds.push_back(dev);
if (!ssds.size())
{ {
auto fast = explode(",", options["fast_devices"], true); fprintf(stderr, "No SSDs found\n");
ssds = collect_devices(fast); return 1;
if (!ssds.size())
{
fprintf(stderr, "No fast devices found\n");
return 1;
}
if (options["journal_size"] == "")
{
auto auto_journal_size = DEFAULT_HYBRID_SSD_JOURNAL;
for (auto & dev: devinfo)
{
if (dev.is_hdd)
{
auto_journal_size = DEFAULT_HYBRID_JOURNAL;
break;
}
}
options["journal_size"] = auto_journal_size;
}
} }
else else if (ssds.size() == devinfo.size())
{ {
std::vector<vitastor_dev_info_t> hdds; fprintf(stderr, "No HDDs found\n");
for (auto & dev: devinfo) return 1;
{
if (!dev.is_hdd)
ssds.push_back(dev);
else
hdds.push_back(dev);
}
if (!ssds.size())
{
fprintf(stderr, "No SSDs found\n");
return 1;
}
if (!hdds.size())
{
fprintf(stderr, "No HDDs found\n");
return 1;
}
devinfo = hdds;
if (options["journal_size"] == "")
{
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
} }
if (options["journal_size"] == "")
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
} }
else else
{ {
@ -672,28 +628,31 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto journal_size = options["journal_size"]; auto journal_size = options["journal_size"];
for (auto & dev: devinfo) for (auto & dev: devinfo)
{ {
// Select new partitions and create an OSD on each of them if (!hybrid || dev.is_hdd)
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
{ {
options["force"] = true; // Select new partitions and create an OSD on each of them
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid); for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
if (hybrid)
{ {
// Select/create journal and metadata partitions options["force"] = true;
int r = get_meta_partition(ssds, options); options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
if (r != 0) if (hybrid)
{ {
return 1; // Select/create journal and metadata partitions
int r = get_meta_partition(ssds, options);
if (r != 0)
{
return 1;
}
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
} }
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
} }
} }
} }

View File

@ -18,7 +18,7 @@ struct resizer_data_moving_t
uint64_t old_loc, new_loc; uint64_t old_loc, new_loc;
}; };
int disk_tool_t::raw_resize() int disk_tool_t::resize_data()
{ {
int r; int r;
// Parse parameters // Parse parameters
@ -91,7 +91,7 @@ int disk_tool_t::resize_parse_params()
try try
{ {
dsk.parse_config(options); dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached"; dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -114,10 +114,7 @@ int disk_tool_t::resize_parse_params()
new_data_offset = options.find("new_data_offset") != options.end() new_data_offset = options.find("new_data_offset") != options.end()
? parse_size(options["new_data_offset"]) : dsk.data_offset; ? parse_size(options["new_data_offset"]) : dsk.data_offset;
new_data_len = options.find("new_data_len") != options.end() new_data_len = options.find("new_data_len") != options.end()
? parse_size(options["new_data_len"]) ? parse_size(options["new_data_len"]) : dsk.data_len;
: (options.find("new_data_offset") != options.end()
? dsk.data_device_size-new_data_offset
: dsk.data_len);
new_meta_offset = options.find("new_meta_offset") != options.end() new_meta_offset = options.find("new_meta_offset") != options.end()
? parse_size(options["new_meta_offset"]) : dsk.meta_offset; ? parse_size(options["new_meta_offset"]) : dsk.meta_offset;
new_meta_len = options.find("new_meta_len") != options.end() new_meta_len = options.find("new_meta_len") != options.end()
@ -126,14 +123,6 @@ int disk_tool_t::resize_parse_params()
? parse_size(options["new_journal_offset"]) : dsk.journal_offset; ? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
new_journal_len = options.find("new_journal_len") != options.end() new_journal_len = options.find("new_journal_len") != options.end()
? parse_size(options["new_journal_len"]) : dsk.journal_len; ? parse_size(options["new_journal_len"]) : dsk.journal_len;
if (new_data_len+new_data_offset > dsk.data_device_size)
new_data_len = dsk.data_device_size-new_data_offset;
if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
new_data_len+new_data_offset > new_meta_offset)
new_data_len = new_meta_offset-new_data_offset;
if (new_journal_device == dsk.data_device && new_data_offset < new_journal_offset &&
new_data_len+new_data_offset > new_journal_offset)
new_data_len = new_journal_offset-new_data_offset;
if (new_meta_device == dsk.meta_device && if (new_meta_device == dsk.meta_device &&
new_journal_device == dsk.journal_device && new_journal_device == dsk.journal_device &&
new_data_offset == dsk.data_offset && new_data_offset == dsk.data_offset &&
@ -170,10 +159,10 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
dsk.data_csum_type = hdr->data_csum_type; dsk.data_csum_type = hdr->data_csum_type;
dsk.csum_block_size = hdr->csum_block_size; dsk.csum_block_size = hdr->csum_block_size;
} }
if (((new_data_offset-dsk.data_offset) % dsk.data_block_size)) if (((new_data_len-dsk.data_len) % dsk.data_block_size) ||
((new_data_offset-dsk.data_offset) % dsk.data_block_size))
{ {
fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n", fprintf(stderr, "Data alignment mismatch\n");
dsk.data_offset, new_data_offset, dsk.data_block_size);
exit(1); exit(1);
} }
data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size; data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size;
@ -231,10 +220,10 @@ int disk_tool_t::resize_remap_blocks()
} }
for (uint64_t i = 0; i < free_last; i++) for (uint64_t i = 0; i < free_last; i++)
{ {
if (data_alloc->get(total_blocks-i-1)) if (data_alloc->get(total_blocks-i))
data_remap[total_blocks-i-1] = 0; data_remap[total_blocks-i] = 0;
else else
data_alloc->set(total_blocks-i-1, true); data_alloc->set(total_blocks-i, true);
} }
for (auto & p: data_remap) for (auto & p: data_remap)
{ {
@ -257,7 +246,7 @@ int disk_tool_t::resize_copy_data()
iodepth = 32; iodepth = 32;
} }
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth); ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0) if (dsk.data_fd < 0)
{ {
fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno));
@ -452,7 +441,7 @@ int disk_tool_t::resize_rewrite_journal()
int disk_tool_t::resize_write_new_journal() int disk_tool_t::resize_write_new_journal()
{ {
new_journal_fd = open(new_journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR);
if (new_journal_fd < 0) if (new_journal_fd < 0)
{ {
fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno));
@ -478,13 +467,12 @@ int disk_tool_t::resize_rewrite_meta()
blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf; blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
new_hdr->zero = 0; new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1; new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_FORMAT_V2; new_hdr->version = BLOCKSTORE_META_FORMAT_V1;
new_hdr->meta_block_size = dsk.meta_block_size; new_hdr->meta_block_size = dsk.meta_block_size;
new_hdr->data_block_size = dsk.data_block_size; new_hdr->data_block_size = dsk.data_block_size;
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096; new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
new_hdr->data_csum_type = dsk.data_csum_type; new_hdr->data_csum_type = dsk.data_csum_type;
new_hdr->csum_block_size = dsk.csum_block_size; new_hdr->csum_block_size = dsk.csum_block_size;
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
}, },
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{ {
@ -493,7 +481,7 @@ int disk_tool_t::resize_rewrite_meta()
block_num = remap_it->second; block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last) if (block_num < free_first || block_num >= total_blocks-free_last)
{ {
fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last); fprintf(stderr, "BUG: remapped block not in range\n");
exit(1); exit(1);
} }
block_num += data_idx_diff; block_num += data_idx_diff;
@ -506,8 +494,6 @@ int disk_tool_t::resize_rewrite_meta()
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size); memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
else else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size); memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
} }
); );
if (r != 0) if (r != 0)
@ -521,7 +507,7 @@ int disk_tool_t::resize_rewrite_meta()
int disk_tool_t::resize_write_new_meta() int disk_tool_t::resize_write_new_meta()
{ {
new_meta_fd = open(new_meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR);
if (new_meta_fd < 0) if (new_meta_fd < 0)
{ {
fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno));

View File

@ -1,298 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
#include "json_util.h"
// Resize and/or relocate the journal, metadata and data areas of an existing OSD.
//
// `device` is the OSD data device. If stoull_full(device) is non-zero the value is
// treated as an OSD number and expanded to "/dev/vitastor/osd<N>-data".
//
// The function reads the current OSD superblock, derives the new layout from the
// "move_journal", "move_meta", "journal_size" and "data_size" options, runs raw_resize()
// with that layout and finally rewrites (or clears) the superblocks on all affected devices.
// When the "dry_run" option is present nothing is modified - the plan is only printed.
//
// Returns 0 on success and 1 on any failure.
int disk_tool_t::resize_data(std::string device)
{
    // "move_meta" is the option actually handled below; the previous check looked at
    // "move_data" which is never read, so `--move-meta` alone was rejected as "nothing to do".
    if (options.find("move_journal") == options.end() &&
        options.find("move_meta") == options.end() &&
        options.find("journal_size") == options.end() &&
        options.find("data_size") == options.end())
    {
        fprintf(stderr, "None of --move-journal, --move-meta, --journal-size, --data-size options are specified - nothing to do!\n");
        return 1;
    }
    if (stoull_full(device))
        device = "/dev/vitastor/osd"+device+"-data";
    json11::Json sb = read_osd_superblock(device, true, false);
    if (sb.is_null())
        return 1;
    auto sb_params = json_to_string_map(sb["params"].object_items());
    try
    {
        // Open all three devices in cached mode just to compute the current layout
        dsk.parse_config(sb_params);
        dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
        dsk.open_data();
        dsk.open_meta();
        dsk.open_journal();
        dsk.calc_lengths(true);
    }
    catch (std::exception & e)
    {
        dsk.close_all();
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    // Save FD numbers because calc_lengths() relies on them
    int old_journal_fd = dsk.journal_fd, old_meta_fd = dsk.meta_fd, old_data_fd = dsk.data_fd;
    dsk.close_all();
    bool dry_run = options.find("dry_run") != options.end();
    auto old_journal_device = dsk.journal_device;
    auto old_meta_device = dsk.meta_device;
    new_journal_len = dsk.journal_len;
    if (options.find("journal_size") != options.end())
    {
        new_journal_len = parse_size(options["journal_size"]);
        // Changing the journal size implies "moving" the journal, by default to where it already is
        // (an empty value means "on the data device")
        if (options.find("move_journal") == options.end())
            options["move_journal"] = dsk.journal_device == dsk.data_device ? "" : dsk.journal_device;
    }
    uint64_t new_data_dev_size = 0;
    if (options.find("data_size") != options.end())
    {
        // "max" or anything larger than the device is clamped to the device size
        new_data_dev_size = parse_size(options["data_size"]);
        new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
            ? dsk.data_device_size : new_data_dev_size;
        dsk.data_device_size = new_data_dev_size;
        dsk.cfg_data_size = 0;
        // Temporarily restore the saved FD numbers for calc_lengths(), then reset them again
        dsk.journal_fd = old_journal_fd;
        dsk.meta_fd = old_meta_fd;
        dsk.data_fd = old_data_fd;
        dsk.calc_lengths(true);
        dsk.journal_fd = -1;
        dsk.meta_fd = -1;
        dsk.data_fd = -1;
    }
    std::map<std::string, std::string> move_options;
    if (options.find("move_journal") != options.end())
    {
        if (resize_parse_move_journal(move_options, dry_run) != 0)
            return 1;
    }
    if (options.find("move_meta") != options.end())
    {
        if (resize_parse_move_meta(move_options, dry_run) != 0)
            return 1;
    }
    auto new_journal_device = move_options.find("new_journal_device") != move_options.end()
        ? move_options["new_journal_device"] : dsk.journal_device;
    auto new_meta_device = move_options.find("new_meta_device") != move_options.end()
        ? move_options["new_meta_device"] : dsk.meta_device;
    // Calculate new data & meta offsets
    // The first 4096 bytes are reserved (the superblock is read/written as a 4096-byte block at offset 0)
    new_data_offset = 4096 + (new_journal_device == dsk.data_device ? new_journal_len : 0) +
        (new_meta_device == dsk.data_device ? dsk.meta_len : 0);
    // Keep the new data offset congruent to the old one modulo the data block size
    new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
    if (new_data_offset != dsk.data_offset)
        move_options["new_data_offset"] = std::to_string(new_data_offset);
    if (new_data_dev_size != 0)
        move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset);
    new_meta_offset = 4096 + (new_meta_device == new_journal_device ? new_journal_len : 0);
    if (new_meta_offset != dsk.meta_offset)
        move_options["new_meta_offset"] = std::to_string(new_meta_offset);
    // Run resize
    // raw_resize() takes its parameters from `options`: superblock parameters plus overrides
    auto orig_options = std::move(options);
    options = sb_params;
    for (auto & kv: move_options)
        options[kv.first] = kv.second;
    if (!json)
    {
        std::string cmd;
        for (auto & kv: move_options)
            cmd += "  "+kv.first+" = "+kv.second+"\n";
        fprintf(stderr, "Running resize:\n%s", cmd.c_str());
    }
    if (!dry_run && raw_resize() != 0)
        return 1;
    // Write new superblocks
    json11::Json::object new_sb_params = sb["params"].object_items();
    if (move_options.find("new_journal_device") != move_options.end())
        new_sb_params["journal_device"] = move_options["new_journal_device"];
    if (move_options.find("new_meta_device") != move_options.end())
        new_sb_params["meta_device"] = move_options["new_meta_device"];
    new_sb_params["data_offset"] = new_data_offset;
    new_sb_params["meta_offset"] = new_meta_offset;
    if (move_options.find("new_data_len") != move_options.end())
        new_sb_params["data_size"] = stoull_full(move_options["new_data_len"]);
    // Superblocks are written to every device used by the new layout and cleared
    // on previously used journal/metadata devices which are not part of it anymore
    std::set<std::string> clear_superblocks, write_superblocks;
    write_superblocks.insert(dsk.data_device);
    write_superblocks.insert(new_journal_device);
    write_superblocks.insert(new_meta_device);
    if (write_superblocks.find(old_journal_device) == write_superblocks.end())
        clear_superblocks.insert(old_journal_device);
    if (write_superblocks.find(old_meta_device) == write_superblocks.end())
        clear_superblocks.insert(old_meta_device);
    for (auto & dev: clear_superblocks)
    {
        if (!json)
            fprintf(stderr, "Clearing OSD superblock on %s\n", dev.c_str());
        if (!dry_run && clear_osd_superblock(dev) != 0)
            return 1;
    }
    for (auto & dev: write_superblocks)
    {
        if (!json)
            fprintf(stderr, "Writing new OSD superblock to %s\n", dev.c_str());
        if (!dry_run && !write_osd_superblock(dev, new_sb_params))
            return 1;
    }
    if (json)
    {
        printf("%s\n", json11::Json(json11::Json::object {
            { "new_sb_params", new_sb_params },
        }).dump().c_str());
    }
    return 0;
}
// Translate the "move_journal" option into raw resize parameters.
//
// An empty option value means "put the journal on the data device". Otherwise the value is
// a device path: for a whole disk a new partition of the rounded-up journal size is created
// (unless dry_run is set), for an existing partition its type is checked (or fixed with "force")
// and its whole size is used.
//
// On success fills move_options["new_journal_device"/"new_journal_offset"/"new_journal_len"]
// and may update options["move_journal"] and new_journal_len. When the journal is already
// at the requested place nothing is added to move_options.
// Returns 0 on success (including the "already there" cases) and 1 on error.
int disk_tool_t::resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run)
{
    if (options["move_journal"] == "")
    {
        // move back to the data device
        // but first check if not already there :)
        if (dsk.journal_device == dsk.data_device && new_journal_len == dsk.journal_len)
        {
            // already there
            fprintf(stderr, "journal is already on data device and has the same size\n");
            return 0;
        }
        move_options["new_journal_device"] = dsk.data_device;
        move_options["new_journal_offset"] = "4096";
        move_options["new_journal_len"] = std::to_string(new_journal_len);
    }
    else
    {
        std::string real_dev = realpath_str(options["move_journal"], false);
        if (real_dev == "")
            return 1;
        std::string parent_dev = get_parent_device(real_dev);
        if (parent_dev == "")
            return 1;
        if (parent_dev == real_dev)
        {
            // whole disk - create partition
            std::string old_real_dev = realpath_str(dsk.journal_device);
            if (old_real_dev == "")
                return 1;
            if (options.find("force") == options.end() &&
                get_parent_device(old_real_dev) == parent_dev)
            {
                // already there
                fprintf(stderr, "journal is already on a partition of %s, add --force to create a new partition\n", options["move_journal"].c_str());
                return 0;
            }
            // Partition sizes are specified in whole MiB - round the journal size up
            new_journal_len = ((new_journal_len+1024*1024-1)/1024/1024)*1024*1024;
            if (!dry_run)
            {
                auto devinfos = collect_devices({ real_dev });
                if (devinfos.size() == 0)
                    return 1;
                std::vector<std::string> sizes;
                sizes.push_back(std::to_string(new_journal_len/1024/1024)+"MiB");
                auto new_parts = add_partitions(devinfos[0], sizes);
                if (!new_parts.array_items().size())
                    return 1;
                options["move_journal"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[0]["uuid"].string_value());
            }
            else
                options["move_journal"] = "<new journal partition on "+parent_dev+">";
        }
        else
        {
            // already a partition - check that it's a GPT partition with correct type
            if ((options.find("force") == options.end()
                ? check_existing_partition(options["move_journal"])
                : fix_partition_type(options["move_journal"])) != 0)
            {
                return 1;
            }
            new_journal_len = get_device_size(options["move_journal"], true);
            if (new_journal_len == UINT64_MAX)
                return 1;
        }
        // The first 4096 bytes of the device are reserved, so the device must be
        // at least that large - otherwise the unsigned subtraction below would wrap around
        if (new_journal_len < 4096)
        {
            fprintf(stderr, "Journal device %s is too small\n", options["move_journal"].c_str());
            return 1;
        }
        new_journal_len -= 4096;
        move_options["new_journal_device"] = options["move_journal"];
        move_options["new_journal_offset"] = "4096";
        move_options["new_journal_len"] = std::to_string(new_journal_len);
    }
    return 0;
}
// Translate the "move_meta" option into raw resize parameters.
//
// An empty option value means "put the metadata on the data device". Otherwise the value is
// a device path: for a whole disk a new partition of the rounded-up metadata size is created
// (unless dry_run is set), for an existing partition its type is checked (or fixed with "force")
// and its whole size is used.
//
// On success fills move_options["new_meta_device"/"new_meta_len"] (and "new_meta_offset" for
// a separate device) and may update options["move_meta"]. When the metadata is already
// at the requested place nothing is added to move_options.
// Returns 0 on success (including the "already there" cases) and 1 on error.
int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & move_options, bool dry_run)
{
    if (options["move_meta"] == "")
    {
        // move back to the data device
        // but first check if not already there :)
        if (dsk.meta_device == dsk.data_device)
        {
            // already there
            fprintf(stderr, "metadata is already on data device\n");
            return 0;
        }
        move_options["new_meta_device"] = dsk.data_device;
        move_options["new_meta_len"] = std::to_string(dsk.meta_len);
    }
    else
    {
        std::string real_dev = realpath_str(options["move_meta"], false);
        if (real_dev == "")
            return 1;
        std::string parent_dev = get_parent_device(real_dev);
        if (parent_dev == "")
            return 1;
        uint64_t new_meta_len = 0;
        if (parent_dev == real_dev)
        {
            // whole disk - create partition
            std::string old_real_dev = realpath_str(dsk.meta_device);
            if (old_real_dev == "")
                return 1;
            if (options.find("force") == options.end() &&
                get_parent_device(old_real_dev) == parent_dev)
            {
                // already there
                fprintf(stderr, "metadata is already on a partition of %s\n", options["move_meta"].c_str());
                return 0;
            }
            // Partition sizes are specified in whole MiB - round the metadata size up
            new_meta_len = ((dsk.meta_len+1024*1024-1)/1024/1024)*1024*1024;
            if (!dry_run)
            {
                auto devinfos = collect_devices({ real_dev });
                if (devinfos.size() == 0)
                    return 1;
                std::vector<std::string> sizes;
                sizes.push_back(std::to_string(new_meta_len/1024/1024)+"MiB");
                auto new_parts = add_partitions(devinfos[0], sizes);
                if (!new_parts.array_items().size())
                    return 1;
                options["move_meta"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[0]["uuid"].string_value());
            }
            else
                options["move_meta"] = "<new metadata partition on "+parent_dev+">";
        }
        else
        {
            // already a partition - check that it's a GPT partition with correct type
            if ((options.find("force") == options.end()
                ? check_existing_partition(options["move_meta"])
                : fix_partition_type(options["move_meta"])) != 0)
            {
                return 1;
            }
            new_meta_len = get_device_size(options["move_meta"], true);
            if (new_meta_len == UINT64_MAX)
                return 1;
        }
        // The first 4096 bytes of the device are reserved, so the device must be
        // at least that large - otherwise the unsigned subtraction below would wrap around
        if (new_meta_len < 4096)
        {
            fprintf(stderr, "Metadata device %s is too small\n", options["move_meta"].c_str());
            return 1;
        }
        new_meta_len -= 4096;
        move_options["new_meta_len"] = std::to_string(new_meta_len);
        move_options["new_meta_device"] = options["move_meta"];
        move_options["new_meta_offset"] = "4096";
    }
    return 0;
}

View File

@ -6,7 +6,6 @@
#include "disk_tool.h" #include "disk_tool.h"
#include "rw_blocking.h" #include "rw_blocking.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
struct __attribute__((__packed__)) vitastor_disk_superblock_t struct __attribute__((__packed__)) vitastor_disk_superblock_t
{ {
@ -122,7 +121,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
sb->size = sb_size; sb->size = sb_size;
memcpy(sb->json_data, json_data.c_str(), json_data.size()); memcpy(sb->json_data, json_data.c_str(), json_data.size());
sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)); sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf));
int fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); int fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0) if (fd < 0)
{ {
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -150,7 +149,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
json11::Json osd_params; json11::Json osd_params;
std::string json_err; std::string json_err;
std::string real_device, device_type, real_data, real_meta, real_journal; std::string real_device, device_type, real_data, real_meta, real_journal;
int r, fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); int r, fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0) if (fd < 0)
{ {
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -382,34 +381,6 @@ int disk_tool_t::pre_exec_osd(std::string device)
return 0; return 0;
} }
// Invalidate the OSD superblock on device `dev` by zeroing its first 12 bytes (magic and CRC).
//
// The first 4096-byte block is read, patched in memory and written back to offset 0.
// The device is opened with O_DIRECT unless the "io" option equals "cached".
//
// Returns 0 on success and -1 on failure with errno describing the error. A short read or
// write without an OS error is reported as EIO, so callers never see a partial byte count
// (including 0, which they would treat as success) as the result.
int disk_tool_t::clear_osd_superblock(const std::string & dev)
{
    uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
    int result = -1;
    int fd = open(dev.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (fd >= 0)
    {
        // Reset errno to be able to tell a short read/write from a real OS error
        errno = 0;
        if (read_blocking(fd, buf, 4096) == 4096)
        {
            // Clear magic and CRC
            memset(buf, 0, 12);
            if (lseek64(fd, 0, SEEK_SET) == 0 &&
                write_blocking(fd, buf, 4096) == 4096)
            {
                result = 0;
            }
        }
        if (result != 0 && errno == 0)
            errno = EIO;
    }
    // Callers print strerror(errno) on failure - do not let the cleanup clobber it
    int saved_errno = errno;
    if (fd >= 0)
        close(fd);
    free(buf);
    buf = NULL;
    errno = saved_errno;
    return result;
}
int disk_tool_t::purge_devices(const std::vector<std::string> & devices) int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
{ {
std::set<uint64_t> osd_numbers; std::set<uint64_t> osd_numbers;
@ -468,6 +439,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
return 1; return 1;
} }
// Destroy OSD superblocks // Destroy OSD superblocks
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
for (auto & sb: superblocks) for (auto & sb: superblocks)
{ {
for (auto dev_type: std::vector<std::string>{ "data", "meta", "journal" }) for (auto dev_type: std::vector<std::string>{ "data", "meta", "journal" })
@ -475,7 +447,26 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
auto dev = sb["real_"+dev_type+"_device"].string_value(); auto dev = sb["real_"+dev_type+"_device"].string_value();
if (dev != "") if (dev != "")
{ {
int r = clear_osd_superblock(dev); int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
if (r >= 0)
{
fd = r;
r = read_blocking(fd, buf, 4096);
if (r == 4096)
{
// Clear magic and CRC
memset(buf, 0, 12);
r = lseek64(fd, 0, 0);
if (r == 0)
{
r = write_blocking(fd, buf, 4096);
if (r == 4096)
r = 0;
}
}
}
if (fd >= 0)
close(fd);
if (r != 0) if (r != 0)
{ {
fprintf(stderr, "Failed to clear OSD %ju %s device %s superblock: %s\n", fprintf(stderr, "Failed to clear OSD %ju %s device %s superblock: %s\n",
@ -496,7 +487,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
fprintf(stderr, "Failed to delete partition %s: failed to find parent device\n", dev.c_str()); fprintf(stderr, "Failed to delete partition %s: failed to find parent device\n", dev.c_str());
continue; continue;
} }
auto pt = read_parttable(parent_dev); auto pt = read_parttable("/dev/"+parent_dev);
if (!pt.is_object()) if (!pt.is_object())
continue; continue;
json11::Json::array newpt = pt["partitions"].array_items(); json11::Json::array newpt = pt["partitions"].array_items();
@ -507,7 +498,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
auto old_part = newpt[i]; auto old_part = newpt[i];
newpt.erase(newpt.begin()+i, newpt.begin()+i+1); newpt.erase(newpt.begin()+i, newpt.begin()+i+1);
vitastor_dev_info_t devinfo = { vitastor_dev_info_t devinfo = {
.path = parent_dev, .path = "/dev/"+parent_dev,
.pt = json11::Json::object{ { "partitions", newpt } }, .pt = json11::Json::object{ { "partitions", newpt } },
}; };
add_partitions(devinfo, {}); add_partitions(devinfo, {});
@ -516,7 +507,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
errno != ENOENT) errno != ENOENT)
{ {
std::string out; std::string out;
shell_exec({ "partprobe", parent_dev }, "", &out, NULL); shell_exec({ "partprobe", "/dev/"+parent_dev }, "", &out, NULL);
} }
break; break;
} }
@ -525,5 +516,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
} }
} }
} }
free(buf);
buf = NULL;
return 0; return 0;
} }

View File

@ -101,7 +101,7 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
resizer.options = options; resizer.options = options;
for (auto & kv: resize) for (auto & kv: resize)
resizer.options[kv.first] = std::to_string(kv.second); resizer.options[kv.first] = std::to_string(kv.second);
if (resizer.raw_resize() != 0) if (resizer.resize_data() != 0)
{ {
// FIXME: Resize with backup or journal // FIXME: Resize with backup or journal
fprintf( fprintf(

View File

@ -60,14 +60,14 @@ int disable_cache(std::string dev)
auto parent_dev = get_parent_device(dev); auto parent_dev = get_parent_device(dev);
if (parent_dev == "") if (parent_dev == "")
return 1; return 1;
auto scsi_disk = "/sys/block/"+parent_dev.substr(5)+"/device/scsi_disk"; auto scsi_disk = "/sys/block/"+parent_dev+"/device/scsi_disk";
DIR *dir = opendir(scsi_disk.c_str()); DIR *dir = opendir(scsi_disk.c_str());
if (!dir) if (!dir)
{ {
if (errno == ENOENT) if (errno == ENOENT)
{ {
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache // Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
return check_queue_cache(dev.substr(5), parent_dev.substr(5)); return check_queue_cache(dev.substr(5), parent_dev);
} }
else else
{ {
@ -84,7 +84,7 @@ int disable_cache(std::string dev)
{ {
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache // Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
closedir(dir); closedir(dir);
return check_queue_cache(dev.substr(5), parent_dev.substr(5)); return check_queue_cache(dev.substr(5), parent_dev);
} }
scsi_disk += "/"; scsi_disk += "/";
scsi_disk += de->d_name; scsi_disk += de->d_name;
@ -117,38 +117,6 @@ int disable_cache(std::string dev)
return 0; return 0;
} }
// Return the size of `dev` in bytes.
//
// For a regular file this is its st_size, for a block device the size is queried
// with the BLKGETSIZE64 ioctl.
//
// If `dev` does not exist and should_exist is false, 0 is returned.
// On any other error a message is printed to stderr and UINT64_MAX is returned.
uint64_t get_device_size(const std::string & dev, bool should_exist)
{
    struct stat dev_st;
    if (stat(dev.c_str(), &dev_st) < 0)
    {
        if (errno == ENOENT && !should_exist)
        {
            return 0;
        }
        fprintf(stderr, "Error checking %s: %s\n", dev.c_str(), strerror(errno));
        return UINT64_MAX;
    }
    if (!S_ISBLK(dev_st.st_mode))
    {
        return dev_st.st_size;
    }
    // Querying the size doesn't need write access or direct I/O, so open the device
    // read-only - this also works for read-only block devices
    int fd = open(dev.c_str(), O_RDONLY);
    if (fd < 0)
    {
        fprintf(stderr, "Failed to open %s: %s\n", dev.c_str(), strerror(errno));
        return UINT64_MAX;
    }
    uint64_t dev_size = 0;
    if (ioctl(fd, BLKGETSIZE64, &dev_size) < 0)
    {
        fprintf(stderr, "Failed to get %s size: %s\n", dev.c_str(), strerror(errno));
        close(fd);
        return UINT64_MAX;
    }
    close(fd);
    return dev_size;
}
std::string get_parent_device(std::string dev) std::string get_parent_device(std::string dev)
{ {
if (dev.substr(0, 5) != "/dev/") if (dev.substr(0, 5) != "/dev/")
@ -157,26 +125,16 @@ std::string get_parent_device(std::string dev)
return ""; return "";
} }
dev = dev.substr(5); dev = dev.substr(5);
// check if it's a partition - partitions aren't present in /sys/block/
struct stat st;
auto chk = "/sys/block/"+dev;
if (stat(chk.c_str(), &st) == 0)
{
// present in /sys/block/ - not a partition
return "/dev/"+dev;
}
else if (errno != ENOENT)
{
fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno));
return "";
}
int i = dev.size(); int i = dev.size();
while (i > 0 && isdigit(dev[i-1])) while (i > 0 && isdigit(dev[i-1]))
i--; i--;
if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2])) // nvme0n1p1 if (i >= 1 && dev[i-1] == '-') // dm-0, dm-1
return dev;
else if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2])) // nvme0n1p1
i--; i--;
// Check that such block device exists // Check that such block device exists
chk = "/sys/block/"+dev.substr(0, i); struct stat st;
auto chk = "/sys/block/"+dev.substr(0, i);
if (stat(chk.c_str(), &st) < 0) if (stat(chk.c_str(), &st) < 0)
{ {
if (errno != ENOENT) if (errno != ENOENT)
@ -184,9 +142,16 @@ std::string get_parent_device(std::string dev)
fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno)); fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno));
return ""; return "";
} }
return "/dev/"+dev; return dev;
} }
return "/dev/"+dev.substr(0, i); return dev.substr(0, i);
}
bool json_is_true(const json11::Json & val)
{
if (val.is_string())
return val == "true" || val == "yes" || val == "1";
return val.bool_value();
} }
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err) int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err)
@ -343,42 +308,23 @@ uint64_t free_from_parttable(json11::Json pt)
return free; return free;
} }
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid) int fix_partition_type(std::string dev_by_uuid)
{ {
bool is_partuuid = dev_by_uuid.substr(0, 22) == "/dev/disk/by-partuuid/"; auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
auto uuid = is_partuuid ? strtolower(dev_by_uuid.substr(22)) : ""; std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
auto node = realpath_str(dev_by_uuid, false);
std::string parent_dev = get_parent_device(node);
if (parent_dev == "") if (parent_dev == "")
return 1; return 1;
auto pt = read_parttable(parent_dev); auto pt = read_parttable("/dev/"+parent_dev);
if (pt.is_null() || pt.is_bool()) if (pt.is_null() || pt.is_bool())
return 1; return 1;
bool found = false;
std::string script = "label: gpt\n\n"; std::string script = "label: gpt\n\n";
for (const auto & part: pt["partitions"].array_items()) for (const auto & part: pt["partitions"].array_items())
{ {
bool this_part = (part["node"].string_value() == node) && bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
(!is_partuuid || strtolower(part["uuid"].string_value()) == uuid); if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
if (this_part)
{ {
found = true; // Already correct type
if (!is_partuuid) return 0;
{
if (part["uuid"] == "")
{
fprintf(stderr, "Could not determine partition UUID for %s. Please use GPT partitions\n", dev_by_uuid.c_str());
return 1;
}
auto new_dev = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
fprintf(stderr, "Using %s instead of %s\n", new_dev.c_str(), dev_by_uuid.c_str());
dev_by_uuid = new_dev;
}
if (strtolower(part["type"].string_value()) == type_uuid)
{
// Already correct type
return 0;
}
} }
script += part["node"].string_value()+": "; script += part["node"].string_value()+": ";
bool first = true; bool first = true;
@ -388,20 +334,15 @@ int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_
{ {
script += (first ? "" : ", ")+kv.first+"="+ script += (first ? "" : ", ")+kv.first+"="+
(kv.first == "type" && this_part (kv.first == "type" && this_part
? type_uuid ? "e7009fac-a5a1-4d72-af72-53de13059903"
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump())); : (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
first = false; first = false;
} }
} }
script += "\n"; script += "\n";
} }
if (!found)
{
fprintf(stderr, "Could not find partition table entry for %s\n", dev_by_uuid.c_str());
return 1;
}
std::string out; std::string out;
return shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", parent_dev }, script, &out, NULL); return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, &out, NULL);
} }
std::string csum_type_str(uint32_t data_csum_type) std::string csum_type_str(uint32_t data_csum_type)

View File

@ -19,7 +19,6 @@
#include "addr_util.h" #include "addr_util.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
#include "nfs_proxy.h" #include "nfs_proxy.h"
#include "nfs_kv.h" #include "nfs_kv.h"
#include "nfs_block.h" #include "nfs_block.h"

View File

@ -14,7 +14,19 @@
#include "osd.h" #include "osd.h"
#include "http_client.h" #include "http_client.h"
#include "str_util.h" #include "str_util.h"
#include "json_util.h"
// Convert a JSON object into a flat blockstore configuration (string -> string).
// String values are copied as is, null values are skipped and all other values
// are stored in their serialized JSON form.
static blockstore_config_t json_to_bs(const json11::Json::object & config)
{
    blockstore_config_t bs;
    // Iterate by reference to avoid copying every key/value pair
    for (const auto & kv: config)
    {
        if (kv.second.is_string())
            bs[kv.first] = kv.second.string_value();
        else if (!kv.second.is_null())
            bs[kv.first] = kv.second.dump();
    }
    return bs;
}
osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop) osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
{ {
@ -34,7 +46,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
if (!json_is_true(this->config["disable_blockstore"])) if (!json_is_true(this->config["disable_blockstore"]))
{ {
auto bs_cfg = json_to_string_map(this->config); auto bs_cfg = json_to_bs(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd); this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
// Wait for blockstore initialisation before actually starting OSD logic // Wait for blockstore initialisation before actually starting OSD logic
// to prevent peering timeouts during restart with filled databases // to prevent peering timeouts during restart with filled databases
@ -139,7 +151,7 @@ void osd_t::parse_config(bool init)
} }
if (bs) if (bs)
{ {
auto bs_cfg = json_to_string_map(config); auto bs_cfg = json_to_bs(config);
bs->parse_config(bs_cfg); bs->parse_config(bs_cfg);
} }
st_cli.parse_config(config); st_cli.parse_config(config);

View File

@ -150,7 +150,7 @@ class osd_t
bool pg_config_applied = false; bool pg_config_applied = false;
bool etcd_reporting_pg_state = false; bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false; bool etcd_reporting_stats = false;
int print_stats_timer_id = -1, slow_log_timer_id = -1; int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
// peers and PGs // peers and PGs
@ -168,8 +168,6 @@ class osd_t
object_id recovery_last_oid; object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0; int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL; osd_op_t *autosync_op = NULL;
int autosync_copies_to_delete = 0;
int autosync_timer_id = -1;
// Scrubbing // Scrubbing
uint64_t scrub_nearest_ts = 0; uint64_t scrub_nearest_ts = 0;

View File

@ -13,11 +13,10 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
bool first = true; bool first = true;
while (it != pg.flush_actions.end()) while (it != pg.flush_actions.end())
{ {
if (!first && if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.inode != prev_it->first.oid.inode || (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) && fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
(fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH || fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
{ {
// Stop only at the object boundary // Stop only at the object boundary
break; break;
@ -76,7 +75,6 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
// Throw the result away // Throw the result away
return; return;
} }
fb->flush_done++;
if (retval != 0) if (retval != 0)
{ {
if (peer_osd == this->osd_num) if (peer_osd == this->osd_num)
@ -94,11 +92,12 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
auto fd_it = msgr.osd_peer_fds.find(peer_osd); auto fd_it = msgr.osd_peer_fds.find(peer_osd);
if (fd_it != msgr.osd_peer_fds.end()) if (fd_it != msgr.osd_peer_fds.end())
{ {
// Will repeer/stop this PG
msgr.stop_client(fd_it->second); msgr.stop_client(fd_it->second);
} }
return;
} }
} }
fb->flush_done++;
if (fb->flush_done == fb->flush_ops) if (fb->flush_done == fb->flush_ops)
{ {
// This flush batch is done // This flush batch is done

View File

@ -645,18 +645,6 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
{ {
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state)); throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
} }
if (changed && immediate_commit != IMMEDIATE_ALL)
{
// Trigger double automatic sync after changing PG state when we're running with fsyncs.
// First autosync commits all written objects and applies copies_to_delete_after_sync;
// Second autosync commits all deletions run by the first sync.
// Without it, rebalancing in a cluster without load may result in some small amount of
// garbage left on "extra" OSDs of the PG, because last deletions are not synced at all.
// FIXME: 1000% correct way is to switch PG state only after copies_to_delete_after_sync.
// But it's much more complicated.
unstable_write_count += autosync_writes;
autosync_copies_to_delete = 2;
}
if (changed && report) if (changed && report)
{ {
report_pg_state(pg); report_pg_state(pg);

View File

@ -9,10 +9,6 @@ void osd_t::autosync()
{ {
if (immediate_commit != IMMEDIATE_ALL && !autosync_op) if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{ {
if (autosync_copies_to_delete > 0)
{
autosync_copies_to_delete--;
}
autosync_op = new osd_op_t(); autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN; autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = SELF_FD; autosync_op->peer_fd = SELF_FD;
@ -33,11 +29,6 @@ void osd_t::autosync()
} }
delete autosync_op; delete autosync_op;
autosync_op = NULL; autosync_op = NULL;
if (autosync_copies_to_delete > 0)
{
// Trigger the second "copies_to_delete" autosync
autosync();
}
}; };
exec_op(autosync_op); exec_op(autosync_op);
} }

View File

@ -213,15 +213,6 @@ resume_8:
{ {
goto resume_6; goto resume_6;
} }
if (immediate_commit == IMMEDIATE_NONE)
{
// Mark OSDs as dirty because deletions have to be synced too!
for (int i = 0; i < op_data->copies_to_delete_count; i++)
{
auto & chunk = op_data->copies_to_delete[i];
this->dirty_osds.insert(chunk.osd_num);
}
}
} }
for (int i = 0; i < op_data->dirty_pg_count; i++) for (int i = 0; i < op_data->dirty_pg_count; i++)
{ {
@ -236,7 +227,7 @@ resume_8:
start_pg_peering(pg); start_pg_peering(pg);
} }
} }
// FIXME: Free those in the destructor (not here)? // FIXME: Free those in the destructor?
free(op_data->dirty_pgs); free(op_data->dirty_pgs);
op_data->dirty_pgs = NULL; op_data->dirty_pgs = NULL;
op_data->dirty_osds = NULL; op_data->dirty_osds = NULL;

View File

@ -7,12 +7,6 @@
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg) bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
// First check if PG is not active anymore
if (!(pg.state & PG_ACTIVE))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return false;
}
// Check if actions are pending for this object // Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){ auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid, .oid = op_data->oid,

View File

@ -55,3 +55,10 @@ json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object &
{ {
return cli_config; return cli_config;
} }
// Interpret a JSON value as a boolean flag.
// Strings are true only when equal to "true", "yes" or "1";
// every other value is converted with bool_value().
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
    {
        return val.bool_value();
    }
    const std::string & text = val.string_value();
    return text == "true" || text == "yes" || text == "1";
}

View File

@ -1,35 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include "json_util.h"
// Convert a JSON object into a flat string -> string map.
// String values are copied as is, null values are skipped and all other values
// are stored in their serialized JSON form.
std::map<std::string, std::string> json_to_string_map(const json11::Json::object & config)
{
    std::map<std::string, std::string> bs;
    // Iterate by reference to avoid copying every key/value pair
    for (const auto & kv: config)
    {
        if (kv.second.is_string())
            bs[kv.first] = kv.second.string_value();
        else if (!kv.second.is_null())
            bs[kv.first] = kv.second.dump();
    }
    return bs;
}
// Interpret a JSON value as a boolean flag.
// Strings are true only when equal to "true", "yes" or "1";
// every other value is converted with bool_value().
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
    {
        return val.bool_value();
    }
    const std::string & text = val.string_value();
    return text == "true" || text == "yes" || text == "1";
}
// Check whether a JSON value explicitly means "false".
// That is: boolean false, the number 0 or one of the strings "false", "no", "0".
// Anything else (including null, arrays and objects) is not considered false.
bool json_is_false(const json11::Json & val)
{
    if (val.is_bool())
    {
        return !val.bool_value();
    }
    if (val.is_number())
    {
        return val.number_value() == 0;
    }
    if (!val.is_string())
    {
        return false;
    }
    const std::string & text = val.string_value();
    return text == "false" || text == "no" || text == "0";
}

View File

@ -1,13 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <map>
#include <string>
#include "json11/json11.hpp"
// Convert a JSON object to a string map: string values are copied as is,
// null values are skipped, all other values are stored in serialized (dumped) form
std::map<std::string, std::string> json_to_string_map(const json11::Json::object & config);
// True for the strings "true", "yes", "1"; non-string values use Json::bool_value()
bool json_is_true(const json11::Json & val);
// True for the strings "false", "no", "0", for a number equal to 0 and for boolean false;
// false for everything else
bool json_is_false(const json11::Json & val);

View File

@ -62,7 +62,7 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
.callback = callback, .callback = callback,
}); });
inc_timer(timers[timers.size()-1]); inc_timer(timers[timers.size()-1]);
set_nearest(false); set_nearest();
return timer_id; return timer_id;
} }
@ -82,13 +82,13 @@ void timerfd_manager_t::clear_timer(int timer_id)
{ {
nearest--; nearest--;
} }
set_nearest(false); set_nearest();
break; break;
} }
} }
} }
void timerfd_manager_t::set_nearest(bool trigger_inline) void timerfd_manager_t::set_nearest()
{ {
if (onstack > 0) if (onstack > 0)
{ {
@ -134,13 +134,10 @@ again:
} }
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0) if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
{ {
// It already happened - set minimal timeout // It already happened
if (trigger_inline) // FIXME: Postpone to setImmediate/BH to avoid reenterability problems
{ trigger_nearest();
trigger_nearest(); goto again;
goto again;
}
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
} }
if (timerfd_settime(timerfd, 0, &exp, NULL)) if (timerfd_settime(timerfd, 0, &exp, NULL))
{ {
@ -160,7 +157,7 @@ void timerfd_manager_t::handle_readable()
trigger_nearest(); trigger_nearest();
} }
wait_state = 0; wait_state = 0;
set_nearest(true); set_nearest();
} }
void timerfd_manager_t::trigger_nearest() void timerfd_manager_t::trigger_nearest()

View File

@ -26,7 +26,7 @@ class timerfd_manager_t
std::vector<timerfd_timer_t> timers; std::vector<timerfd_timer_t> timers;
void inc_timer(timerfd_timer_t & t); void inc_timer(timerfd_timer_t & t);
void set_nearest(bool trigger_inline); void set_nearest();
void trigger_nearest(); void trigger_nearest();
void handle_readable(); void handle_readable();
public: public:

View File

@ -83,19 +83,16 @@ fi
POOLCFG='"name":"testpool","failure_domain":"osd",'$POOLCFG POOLCFG='"name":"testpool","failure_domain":"osd",'$POOLCFG
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT'}}' $ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT'}}'
wait_pool_up() wait_up()
{ {
local sec=$1 local sec=$1
local pool=$2
local pgsize=$3
local pgcount=$4
local i=0 local i=0
local configured=0 local configured=0
while [[ $i -lt $sec ]]; do while [[ $i -lt $sec ]]; do
if $ETCDCTL get /vitastor/pg/config --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["'$pool'"][] | if $ETCDCTL get /vitastor/pg/config --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] |
select(((.osd_set | select(. != 0) | sort | unique) | length) == '$pgsize') ] | length) == '$pgcount; then select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT; then
configured=1 configured=1
if $ETCDCTL get /vitastor/pg/state/$pool/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$pgcount; then if $ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT; then
break break
fi fi
fi fi
@ -110,11 +107,6 @@ wait_pool_up()
done done
} }
wait_up()
{
wait_pool_up "$1" 1 $PG_SIZE $PG_COUNT
}
if [[ $OSD_COUNT -gt 0 ]]; then if [[ $OSD_COUNT -gt 0 ]]; then
wait_up 120 wait_up 120
fi fi

View File

@ -68,11 +68,6 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
./test_resize.sh
./test_resize_auto.sh
./test_snapshot_pool2.sh
./test_osd_tags.sh ./test_osd_tags.sh
./test_enospc.sh ./test_enospc.sh

View File

@ -3,7 +3,6 @@
PG_COUNT=${PG_COUNT:-32} PG_COUNT=${PG_COUNT:-32}
. `dirname $0`/run_3osds.sh . `dirname $0`/run_3osds.sh
check_qemu
LD_PRELOAD="build/src/client/libfio_vitastor.so" \ LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \ fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \
@ -27,22 +26,22 @@ for i in $(seq 1 $OSD_COUNT); do
offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin) offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
meta_offset=$(echo $offsets | jq -r .meta_offset) meta_offset=$(echo $offsets | jq -r .meta_offset)
data_offset=$(echo $offsets | jq -r .data_offset) data_offset=$(echo $offsets | jq -r .data_offset)
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
build/src/disk_tool/vitastor-disk raw-resize --io cached \ build/src/disk_tool/vitastor-disk resize \
$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \ $(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
--new_meta_offset 0 \ --new_meta_offset 0 \
--new_meta_len $((1024*1024)) \ --new_meta_len $((1024*1024)) \
--new_journal_offset $((1024*1024)) \ --new_journal_offset $((1024*1024)) \
--new_data_offset $((128*1024*1024+32768)) --new_data_offset $((128*1024*1024))
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \ if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
format_error "OSD $i metadata corrupted after resizing" format_error "OSD $i metadata corrupted after resizing"
fi fi
if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \ if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then jq -e -s 'map([ .[].entries[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
format_error "OSD $i journal corrupted after resizing" format_error "OSD $i journal corrupted after resizing"
fi fi
done done
@ -54,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/bin/test_osd$i.bin \ --data_device ./testdata/bin/test_osd$i.bin \
--meta_offset 0 \ --meta_offset 0 \
--journal_offset $((1024*1024)) \ --journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 & --data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$! eval OSD${i}_PID=$!
done done

View File

@ -1,94 +0,0 @@
#!/bin/bash -ex
# Test of `vitastor-disk resize` (and, along the way, `prepare --hybrid`,
# write-journal/dump-journal and write-meta/dump-meta) on loop devices backed
# by files in ./testdata/bin. The script runs with -e, so any failing command,
# including a `diff` that finds a difference, aborts the test.
# NOTE(review): ANTIETCD is presumably consumed by common.sh - confirm.
ANTIETCD=1
. `dirname $0`/common.sh
# Make vitastor-disk also reachable under the name vitastor-disk-test.
# NOTE(review): presumably the tool behaves differently when invoked under this name - confirm.
[[ -e build/src/disk_tool/vitastor-disk-test ]] || ln -s vitastor-disk build/src/disk_tool/vitastor-disk-test
# Data device: a 100 GiB file created by writing a single byte at its last offset,
# attached to a free loop device
dd if=/dev/zero of=./testdata/bin/test_osd1.bin bs=1 count=1 seek=$((100*1024*1024*1024-1))
LOOP1=$(sudo losetup --show -f ./testdata/bin/test_osd1.bin)
# Cleanup on exit. $(jobs -p) and $LOOP1 are inside double quotes, so they are
# expanded now, when the trap is defined, not when it fires.
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1"' || true' EXIT
# "Fast" device for journal and metadata: a 1 GiB file created the same way
dd if=/dev/zero of=./testdata/bin/test_meta.bin bs=1 count=1 seek=$((1024*1024*1024-1))
LOOP2=$(sudo losetup --show -f ./testdata/bin/test_meta.bin)
# Re-register the exit trap so that it detaches both loop devices
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT
# also test prepare --hybrid :)
# non-vitastor random type UUID to prevent udev activation
# Mount devtmpfs on /dev unless `mount` already lists it.
# NOTE(review): presumably needed so that the ${LOOPn}pN partition nodes appear - confirm.
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
--block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
--hybrid --fast-devices $LOOP2 $LOOP1
# write almost empty journal
node <<EOF > ./testdata/journal.json
console.log(JSON.stringify([
{"type":"start","start":"0x1000"},
{"type":"big_write_instant","inode":"0x1000000000001","stripe":"0xc60000","ver":"10","offset":0,"len":131072,"loc":"0x18ffdc0000","bitmap":"ffffffff"}
]));
EOF
sudo build/src/disk_tool/vitastor-disk write-journal ${LOOP1}p1 < ./testdata/journal.json
# Round trip check: the dumped journal (with crc32 fields removed) must equal
# the written entries with "valid":true added to each of them
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# write fake metadata items in the end
# BLOCK_COUNT is the number of 128 KiB blocks in the partition size minus 4096 bytes.
# NOTE(review): the 4096 bytes are presumably a reserved header area - confirm.
DATA_DEV_SIZE=$(sudo blockdev --getsize64 ${LOOP1}p1)
BLOCK_COUNT=$(((DATA_DEV_SIZE-4096)/128/1024))
# The heredoc below is unquoted, so $BLOCK_COUNT is substituted by the shell
# before node runs the script. It generates 100 entries occupying the last
# 100 block numbers (BLOCK_COUNT-100 .. BLOCK_COUNT-1).
node <<EOF > ./testdata/meta.json
console.log(JSON.stringify({
version: "0.9",
meta_block_size: 4096,
data_block_size: 131072,
bitmap_granularity: 4096,
data_csum_type: "none",
csum_block_size: 0,
entries: [ ...new Array(100).keys() ].map(i => ({
block: ($BLOCK_COUNT-100)+i,
pool: 1,
inode: "0x1",
stripe: "0x"+Number(i*0x20000).toString(16),
version: 10,
bitmap: "ffffffff",
ext_bitmap: "ffffffff",
})),
}));
EOF
# also test write & dump
sudo build/src/disk_tool/vitastor-disk write-meta ${LOOP1}p1 < ./testdata/meta.json
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 > ./testdata/compare.json
# Normalize both documents with sorted keys (jq -S) before comparing
jq -S < ./testdata/meta.json > ./testdata/1.json
jq -S < ./testdata/compare.json > ./testdata/2.json
diff ./testdata/1.json ./testdata/2.json
# move journal & meta back, data will become smaller; end indexes should be shifted by -1251
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal '' --move-meta '' ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
# Expected metadata: the original entries with every block number decreased by 1251
jq -S '. + {"entries": [ .entries[] | (. + { "block": (.block-1251) }) ]}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# Expected journal: the same entries, but the big_write_instant entry must point to the new location
jq -S '[ (.[] + {"valid":true}) | (if .type == "big_write_instant" then . + {"loc":"0x18f6160000"} else . end) ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# move journal & meta out, data will become larger; end indexes should be shifted back by +1251
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal ${LOOP2}p1 --move-meta ${LOOP2}p2 ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): j1.json and j2.json are regenerated here but never compared -
# confirm whether a `diff ./testdata/j1.json ./testdata/j2.json` is intended.
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# reduce data device size by exactly 128k * 99 (occupied blocks); exactly 1 should be left in place :)
sudo build/src/disk_tool/vitastor-disk-test resize --data-size $((DATA_DEV_SIZE-128*1024*99)) ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
# Expected metadata: entries with block > BLOCK_COUNT-100 get block numbers
# counted from 0 (block - (BLOCK_COUNT-100+1)); the first entry keeps block
# BLOCK_COUNT-100 and is moved to the end of the list (.[1:] + [ .[0] ])
jq -S '. + {"entries": ([ .entries[] | (. + { "block": (.block | if . > '$BLOCK_COUNT'-100 then .-('$BLOCK_COUNT'-100+1) else '$BLOCK_COUNT'-100 end) }) ] | .[1:] + [ .[0] ])}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): as above, j1.json and j2.json are produced but not compared - confirm intent.
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# extend data device size to maximum
sudo build/src/disk_tool/vitastor-disk-test resize --data-size max ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
# 1.json is not regenerated: the metadata is expected to be the same as after the shrink above
diff ./testdata/1.json ./testdata/2.json
format_green OK

View File

@ -1,38 +0,0 @@
#!/bin/bash -ex
# Test: an image whose snapshot layer is created in a different pool must still
# read back exactly the data that was written. fio mirrors everything it writes
# into ./testdata/bin/mirror.bin, which serves as the reference for `cmp`.
. `dirname $0`/run_3osds.sh
check_qemu
# snapshot in another pool
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create-pool testpool2 -s 3 -n 4 --failure_domain osd
# Wait up to 30 seconds for pool 2 (arguments: timeout, pool id, PG size, PG count)
wait_pool_up 30 2 3 4
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create -s 128M testchain -p testpool
# Sequential 1M writes to the image with an fsync after every write
LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 -fsync=1 -rw=write \
-etcd=$ETCD_URL -image=testchain -mirror_file=./testdata/bin/mirror.bin -buffer_pattern=0xabcd
# Create the snapshot in testpool2 while the image itself was created in testpool
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL snap-create testchain@snap1 -p testpool2
# 32 random 4k writes after the snapshot was taken
LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4k -direct=1 -iodepth=4 -end_fsync=1 -rw=randwrite -number_ios=32 \
-etcd=$ETCD_URL -image=testchain -mirror_file=./testdata/bin/mirror.bin -buffer_pattern=0xabcd
# Read the image back in three different ways; each result must match the mirror file byte for byte
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL dd iimg=testchain of=./testdata/bin/res.bin bs=128k iodepth=4
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL dd iimg=testchain of=./testdata/bin/res.bin bs=32k iodepth=4 conv=nosparse
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
qemu-img convert -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testchain" \
-O raw ./testdata/bin/res.bin
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
format_green OK