forked from vitalif/vitastor
Compare commits
4 Commits
v0.9.1
...
hier-failu
Author | SHA1 | Date | |
---|---|---|---|
72f0cff79d | |||
c1d470522c | |||
57feb7f390 | |||
431f780347 |
@@ -550,111 +550,3 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_zero_osd_2:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: ZERO_OSD=2 /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_xor:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=xor /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_pg_size_3:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: PG_SIZE=3 /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub_ec:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=ec /root/vitastor/tests/test_scrub.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
|
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.9.1")
|
||||
set(VERSION "0.8.9")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.9.1
|
||||
VERSION ?= v0.8.9
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.9.1
|
||||
image: vitalif/vitastor-csi:v0.8.9
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.9.1
|
||||
image: vitalif/vitastor-csi:v0.8.9
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.9.1"
|
||||
vitastorCSIDriverVersion = "0.8.9"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.9.1-1) unstable; urgency=medium
|
||||
vitastor (0.8.9-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.9.1-1) unstable; urgency=medium
|
||||
vitastor (0.8.9-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.9.1; \
|
||||
cd vitastor-0.9.1; \
|
||||
cp -r /root/vitastor vitastor-0.8.9; \
|
||||
cd vitastor-0.8.9; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.1.orig.tar.xz vitastor-0.9.1; \
|
||||
cd vitastor-0.9.1; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.9.orig.tar.xz vitastor-0.8.9; \
|
||||
cd vitastor-0.8.9; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -25,16 +25,11 @@ running if required parameters are specified.
|
||||
## etcd_address
|
||||
|
||||
- Type: string or array of strings
|
||||
- Can be changed online: yes
|
||||
|
||||
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
|
||||
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
Note that https is not supported for etcd connections yet.
|
||||
|
||||
etcd connection endpoints can be changed online by updating global
|
||||
configuration in etcd itself - this allows switching the cluster to new
|
||||
etcd addresses without downtime.
|
||||
|
||||
## etcd_prefix
|
||||
|
||||
- Type: string
|
||||
@@ -47,6 +42,5 @@ example, use a single etcd cluster for multiple Vitastor clusters.
|
||||
|
||||
- Type: integer
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Log level. Raise if you want more verbose output.
|
||||
|
@@ -24,14 +24,10 @@
|
||||
## etcd_address
|
||||
|
||||
- Тип: строка или массив строк
|
||||
- Можно менять на лету: да
|
||||
|
||||
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
|
||||
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
|
||||
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
|
||||
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
|
||||
|
||||
## etcd_prefix
|
||||
|
||||
- Тип: строка
|
||||
@@ -45,6 +41,5 @@
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Уровень логирования. Повысьте, если хотите более подробный вывод.
|
||||
|
@@ -153,7 +153,6 @@ operations.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Interval before attempting to reconnect to an unavailable OSD.
|
||||
|
||||
@@ -162,7 +161,6 @@ Interval before attempting to reconnect to an unavailable OSD.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for OSD connection attempts.
|
||||
|
||||
@@ -171,7 +169,6 @@ Timeout for OSD connection attempts.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
OSD connection inactivity time after which clients and other OSDs send
|
||||
keepalive requests to check state of the connection.
|
||||
@@ -181,7 +178,6 @@ keepalive requests to check state of the connection.
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Minimum: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
@@ -192,7 +188,6 @@ is scheduled.
|
||||
- Type: milliseconds
|
||||
- Default: 500
|
||||
- Minimum: 50
|
||||
- Can be changed online: yes
|
||||
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
@@ -202,7 +197,6 @@ the time for the clients to wait before re-attempting such I/O requests.
|
||||
|
||||
- Type: integer
|
||||
- Default: 5
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of attempts for etcd requests which can't be retried
|
||||
indefinitely.
|
||||
@@ -211,7 +205,6 @@ indefinitely.
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 1000
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
|
||||
@@ -219,7 +212,6 @@ Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 5000
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd requests which are allowed to wait for some time.
|
||||
|
||||
@@ -227,7 +219,6 @@ Timeout for etcd requests which are allowed to wait for some time.
|
||||
|
||||
- Type: seconds
|
||||
- Default: max(30, etcd_report_interval*2)
|
||||
- Can be changed online: yes
|
||||
|
||||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
@@ -236,7 +227,6 @@ etcd_report_interval to guarantee that keepalive actually works.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 30
|
||||
- Can be changed online: yes
|
||||
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
detect disconnections quickly.
|
||||
@@ -245,7 +235,6 @@ detect disconnections quickly.
|
||||
|
||||
- Type: integer
|
||||
- Default: 33554432
|
||||
- Can be changed online: yes
|
||||
|
||||
Without immediate_commit=all this parameter sets the limit of "dirty"
|
||||
(not committed by fsync) data allowed by the client before forcing an
|
||||
|
@@ -161,7 +161,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время ожидания перед повторной попыткой соединиться с недоступным OSD.
|
||||
|
||||
@@ -170,7 +169,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время ожидания попытки соединения с OSD.
|
||||
|
||||
@@ -179,7 +177,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время неактивности соединения с OSD, после которого клиенты или другие OSD
|
||||
посылают запрос проверки состояния соединения.
|
||||
@@ -189,7 +186,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Минимальное значение: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время ожидания ответа на запрос проверки состояния соединения.
|
||||
Если OSD не отвечает за это время, соединение отключается и производится
|
||||
@@ -200,7 +196,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 500
|
||||
- Минимальное значение: 50
|
||||
- Можно менять на лету: да
|
||||
|
||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
|
||||
@@ -212,7 +207,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число попыток выполнения запросов к etcd для тех запросов,
|
||||
которые нельзя повторять бесконечно.
|
||||
@@ -221,7 +215,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 1000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время выполнения запросов к etcd, которые должны завершаться
|
||||
быстро, таких, как обновление резервации (lease).
|
||||
@@ -230,7 +223,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 5000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное время выполнения запросов к etcd, для которых не обязательно
|
||||
гарантировать быстрое выполнение.
|
||||
@@ -239,7 +231,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: max(30, etcd_report_interval*2)
|
||||
- Можно менять на лету: да
|
||||
|
||||
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
@@ -248,7 +239,6 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 30
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал проверки живости вебсокет-подключений к etcd.
|
||||
|
||||
@@ -256,7 +246,6 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 33554432
|
||||
- Можно менять на лету: да
|
||||
|
||||
При работе без immediate_commit=all - это лимит объёма "грязных" (не
|
||||
зафиксированных fsync-ом) данных, при достижении которого клиент будет
|
||||
|
@@ -7,8 +7,7 @@
|
||||
# Runtime OSD Parameters
|
||||
|
||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||
initialization and can be changed - either with an OSD restart or, for some of
|
||||
them, even without restarting by updating configuration in etcd.
|
||||
initialization and can be changed with an OSD restart.
|
||||
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -39,14 +38,6 @@ them, even without restarting by updating configuration in etcd.
|
||||
- [throttle_target_parallelism](#throttle_target_parallelism)
|
||||
- [throttle_threshold_us](#throttle_threshold_us)
|
||||
- [osd_memlock](#osd_memlock)
|
||||
- [auto_scrub](#auto_scrub)
|
||||
- [no_scrub](#no_scrub)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [scrub_queue_depth](#scrub_queue_depth)
|
||||
- [scrub_sleep](#scrub_sleep)
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -100,7 +91,6 @@ OSD by hand.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 5
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||
the immediate_commit mode is disabled. fsyncs are required because without
|
||||
@@ -113,7 +103,6 @@ issue fsyncs at all.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||
operations before issuing an fsync operation internally.
|
||||
@@ -122,7 +111,6 @@ operations before issuing an fsync operation internally.
|
||||
|
||||
- Type: integer
|
||||
- Default: 4
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed of recovery
|
||||
@@ -132,7 +120,6 @@ and rebalancing, but it's planned to implement more.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
@@ -143,7 +130,6 @@ Degraded PGs are anyway scanned first.
|
||||
|
||||
- Type: integer
|
||||
- Default: 16
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of recovery operations before issuing an additional fsync.
|
||||
|
||||
@@ -159,7 +145,6 @@ the underlying device. This may be useful for recovery purposes.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Disable automatic background recovery of objects. Note that it doesn't
|
||||
affect implicit recovery of objects happening during writes - a write is
|
||||
@@ -169,7 +154,6 @@ always made to a full set of at least pg_minsize OSDs.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Disable background movement of data between different OSDs. Disabling it
|
||||
means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
@@ -178,7 +162,6 @@ means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 3
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which OSDs print simple human-readable operation
|
||||
statistics on stdout.
|
||||
@@ -187,7 +170,6 @@ statistics on stdout.
|
||||
|
||||
- Type: seconds
|
||||
- Default: 10
|
||||
- Can be changed online: yes
|
||||
|
||||
Time interval at which OSDs dump slow or stuck operations on stdout, if
|
||||
there are any. Also it's the time after which an operation is considered
|
||||
@@ -197,7 +179,6 @@ they're any. Also it's the time after which an operation is considered
|
||||
|
||||
- Type: seconds
|
||||
- Default: 60
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
|
||||
@@ -205,7 +186,6 @@ Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
- Can be changed online: yes
|
||||
|
||||
Parallel client write operation limit per one OSD. Operations that exceed
|
||||
this limit are pushed to a temporary queue instead of being executed
|
||||
@@ -215,7 +195,6 @@ immediately.
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Flusher is a micro-thread that moves data from the journal to the data
|
||||
area of the device. Their number is auto-tuned between minimum and maximum.
|
||||
@@ -225,7 +204,6 @@ Minimum number is set by this parameter.
|
||||
|
||||
- Type: integer
|
||||
- Default: 256
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum number of journal flushers (see above min_flusher_count).
|
||||
|
||||
@@ -282,7 +260,6 @@ Most (99%) other SSDs don't need this option.
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
|
||||
with fast journal/metadata devices and slow data devices. The idea is that
|
||||
@@ -300,7 +277,6 @@ fills up.
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum number of throttled operations per second under the condition
|
||||
of full journal. Set it to approximate random write iops of your data devices
|
||||
@@ -310,7 +286,6 @@ of full journal. Set it to approximate random write iops of your data devices
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum bandwidth in MB/s of throttled operations per second under
|
||||
the condition of full journal. Set it to approximate linear write
|
||||
@@ -320,7 +295,6 @@ performance of your data devices (HDDs).
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Target maximum parallelism of throttled operations under the condition of
|
||||
full journal. Set it to approximate internal parallelism of your data
|
||||
@@ -330,7 +304,6 @@ devices (1 for HDDs, 4-8 for SSDs).
|
||||
|
||||
- Type: microseconds
|
||||
- Default: 50
|
||||
- Can be changed online: yes
|
||||
|
||||
Minimal computed delay to be applied to throttled operations. Usually
|
||||
doesn't need to be changed.
|
||||
@@ -340,103 +313,4 @@ doesn't need to be changed.
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with
|
||||
mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
|
||||
## auto_scrub
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Data scrubbing is the process of background verification of copies to find
|
||||
and repair corrupted blocks. It's not run automatically by default since
|
||||
it's a new feature. Set this parameter to true to enable automatic scrubs.
|
||||
|
||||
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
|
||||
every `scrub_interval` (see below). You can also start/schedule scrubbing
|
||||
manually by setting `next_scrub` JSON key to the desired UNIX time of the
|
||||
next scrub in `/pg/history/...` values in etcd.
|
||||
|
||||
## no_scrub
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
- Can be changed online: yes
|
||||
|
||||
Temporarily disable scrubbing and stop running scrubs.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Type: string
|
||||
- Default: 30d
|
||||
- Can be changed online: yes
|
||||
|
||||
Default automatic scrubbing interval for all pools. Numbers without suffix
|
||||
are treated as seconds, possible unit suffixes include 's' (seconds),
|
||||
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
|
||||
|
||||
## scrub_queue_depth
|
||||
|
||||
- Type: integer
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of parallel scrubbing operations per one OSD.
|
||||
|
||||
## scrub_sleep
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Additional interval between two consecutive scrubbing operations on one OSD.
|
||||
Can be used to slow down scrubbing if it affects user load too much.
|
||||
|
||||
## scrub_list_limit
|
||||
|
||||
- Type: integer
|
||||
- Default: 1000
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
||||
## scrub_find_best
|
||||
|
||||
- Type: boolean
|
||||
- Default: true
|
||||
- Can be changed online: yes
|
||||
|
||||
Find and automatically restore best versions of objects with unmatched
|
||||
copies. In replicated setups, the best version is the version with most
|
||||
matching replicas. In EC setups, the best version is the subset of data
|
||||
and parity chunks without mismatches.
|
||||
|
||||
The hypothetical situation where you might want to disable it is when
|
||||
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
|
||||
corrupt an object in the same way (for example, zero it out) and only
|
||||
1 HDD will remain good. In this case disabling scrub_find_best may help
|
||||
you to recover the data! See also scrub_ec_max_bruteforce below.
|
||||
|
||||
## scrub_ec_max_bruteforce
|
||||
|
||||
- Type: integer
|
||||
- Default: 100
|
||||
- Can be changed online: yes
|
||||
|
||||
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
|
||||
chunk by brute-forcing all possible error locations. This configuration
|
||||
value limits the maximum number of checked combinations. You can try to
|
||||
increase it if you have EC N+K setup with N and K large enough for
|
||||
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
|
||||
than the default 100.
|
||||
|
||||
If there are too many possible combinations or if multiple combinations give
|
||||
correct results then objects are marked inconsistent and aren't recovered
|
||||
automatically.
|
||||
|
||||
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
|
||||
the variant with most available equal copies is correct. For example, if
|
||||
you have 3 replicas and 1 of them differs, this one is considered to be
|
||||
corrupted. But if there is no "best" version with more copies than all
|
||||
others have then the object is also marked as inconsistent.
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
|
@@ -8,8 +8,7 @@
|
||||
|
||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
||||
изменения конфигурации в etcd.
|
||||
момент с перезапуском OSD.
|
||||
|
||||
- [etcd_report_interval](#etcd_report_interval)
|
||||
- [run_primary](#run_primary)
|
||||
@@ -40,14 +39,6 @@
|
||||
- [throttle_target_parallelism](#throttle_target_parallelism)
|
||||
- [throttle_threshold_us](#throttle_threshold_us)
|
||||
- [osd_memlock](#osd_memlock)
|
||||
- [auto_scrub](#auto_scrub)
|
||||
- [no_scrub](#no_scrub)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [scrub_queue_depth](#scrub_queue_depth)
|
||||
- [scrub_sleep](#scrub_sleep)
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -102,7 +93,6 @@ RUNNING), подходящий под заданную маску. Также н
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
|
||||
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
|
||||
@@ -115,7 +105,6 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Аналогично autosync_interval, но задаёт не временной интервал, а
|
||||
максимальное количество незафиксированных операций записи перед
|
||||
@@ -125,7 +114,6 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 4
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число операций восстановления на одном первичном OSD в любой
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
@@ -136,7 +124,6 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
@@ -148,7 +135,6 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 16
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число операций восстановления перед дополнительным fsync.
|
||||
|
||||
@@ -164,7 +150,6 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
|
||||
что эта опция не отключает восстановление объектов, происходящее при
|
||||
@@ -175,7 +160,6 @@ OSD.
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Отключить фоновое перемещение объектов между разными OSD. Отключение
|
||||
означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
|
||||
@@ -185,7 +169,6 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 3
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал, с которым OSD печатают простую человекочитаемую
|
||||
статистику выполнения операций в стандартный вывод.
|
||||
@@ -194,7 +177,6 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 10
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временной интервал, с которым OSD выводят в стандартный вывод список
|
||||
медленных или зависших операций, если таковые имеются. Также время, при
|
||||
@@ -204,7 +186,6 @@ OSD.
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 60
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число секунд, через которое удалённый инод удаляется и из статистики OSD.
|
||||
|
||||
@@ -212,7 +193,6 @@ OSD.
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число одновременных клиентских операций записи на один OSD.
|
||||
Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
|
||||
@@ -222,7 +202,6 @@ OSD.
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Flusher - это микро-поток (корутина), которая копирует данные из журнала в
|
||||
основную область устройства данных. Их число настраивается динамически между
|
||||
@@ -232,7 +211,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 256
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
|
||||
|
||||
@@ -292,7 +270,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
|
||||
гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
|
||||
@@ -311,7 +288,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётное максимальное число ограничиваемых операций в секунду при условии
|
||||
отсутствия свободного места в журнале. Устанавливайте приблизительно равным
|
||||
@@ -322,7 +298,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
@@ -333,7 +308,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Расчётный максимальный параллелизм ограничиваемых операций в секунду при
|
||||
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
|
||||
@@ -344,7 +318,6 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
|
||||
- Тип: микросекунды
|
||||
- Значение по умолчанию: 50
|
||||
- Можно менять на лету: да
|
||||
|
||||
Минимальная применимая к ограничиваемым операциям задержка. Обычно не
|
||||
требует изменений.
|
||||
@@ -354,113 +327,4 @@ Flusher - это микро-поток (корутина), которая коп
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
|
||||
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
|
||||
заблокированной памяти).
|
||||
|
||||
## auto_scrub
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
|
||||
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
|
||||
запускаются автоматически, так как являются новой функцией. Чтобы включить
|
||||
автоматическое планирование скрабов, установите данный параметр в true.
|
||||
|
||||
Включённый параметр заставляет OSD автоматически планировать фоновую
|
||||
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
|
||||
запустить или запланировать проверку вручную, установив значение ключа JSON
|
||||
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
|
||||
желаемой проверки.
|
||||
|
||||
## no_scrub
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
- Можно менять на лету: да
|
||||
|
||||
Временно отключить и остановить запущенные скрабы.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Тип: строка
|
||||
- Значение по умолчанию: 30d
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
|
||||
Значения без указанной единицы измерения считаются в секундах, допустимые
|
||||
символы единиц измерения в конце: 's' (секунды),
|
||||
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяцы) или 'y' (годы).
|
||||
|
||||
## scrub_queue_depth
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число параллельных операций фоновой проверки на один OSD.
|
||||
|
||||
## scrub_sleep
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
|
||||
одном OSD. Может использоваться для замедления скраба, если он слишком
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
|
||||
## scrub_list_limit
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
проверки.
|
||||
|
||||
## scrub_find_best
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: true
|
||||
- Можно менять на лету: да
|
||||
|
||||
Находить и автоматически восстанавливать "лучшие версии" объектов с
|
||||
несовпадающими копиями/частями. При использовании репликации "лучшая"
|
||||
версия - версия, доступная в большем числе экземпляров, чем другие. При
|
||||
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
|
||||
частей данных и чётности, полностью соответствующих друг другу.
|
||||
|
||||
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
|
||||
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
|
||||
незаметно и одинаково повредить данные одного и того же объекта, например,
|
||||
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
|
||||
отключение этого параметра поможет вам восстановить данные! Смотрите также
|
||||
описание следующего параметра - scrub_ec_max_bruteforce.
|
||||
|
||||
## scrub_ec_max_bruteforce
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 100
|
||||
- Можно менять на лету: да
|
||||
|
||||
Vitastor старается определить повреждённые части объектов при использовании
|
||||
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
|
||||
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
|
||||
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
|
||||
его, если используете схему кодирования EC N+K с N и K, достаточно большими
|
||||
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
|
||||
было больше, чем стандартное значение 100.
|
||||
|
||||
Если возможных комбинаций слишком много или если корректная комбинация не
|
||||
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
|
||||
и не восстанавливаются автоматически.
|
||||
|
||||
При использовании репликации перебор не нужен, Vitastor просто предполагает,
|
||||
что вариант объекта с наибольшим количеством одинаковых копий корректен.
|
||||
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку в пространство подкачки. Требует достаточного значения ulimit -l (лимита заблокированной памяти).
|
||||
|
@@ -40,7 +40,6 @@ Parameters:
|
||||
- [root_node](#root_node)
|
||||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -273,13 +272,6 @@ Specifies OSD tags to prefer putting primary OSDs in this pool to.
|
||||
Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one
|
||||
of the OSDs containing a data chunk for a PG.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Type: time interval (number + unit s/m/h/d/M/y)
|
||||
|
||||
Automatic scrubbing interval for this pool. Overrides
|
||||
[global scrub_interval setting](osd.en.md#scrub_interval).
|
||||
|
||||
# Examples
|
||||
|
||||
## Replicated pool
|
||||
|
@@ -39,7 +39,6 @@
|
||||
- [root_node](#root_node)
|
||||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
|
||||
Примеры:
|
||||
|
||||
@@ -277,13 +276,6 @@ PG в Vitastor эферемерны, то есть вы можете менят
|
||||
для PG этого пула. Имейте в виду, что для EC-пулов Vitastor также всегда
|
||||
предпочитает помещать первичный OSD на один из OSD с данными, а не с чётностью.
|
||||
|
||||
## scrub_interval
|
||||
|
||||
- Тип: временной интервал (число + единица измерения s/m/h/d/M/y)
|
||||
|
||||
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
|
||||
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
|
||||
|
||||
# Примеры
|
||||
|
||||
## Реплицированный пул
|
||||
|
@@ -11,21 +11,13 @@
|
||||
- name: etcd_address
|
||||
type: string or array of strings
|
||||
type_ru: строка или массив строк
|
||||
online: true
|
||||
info: |
|
||||
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
|
||||
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
Note that https is not supported for etcd connections yet.
|
||||
|
||||
etcd connection endpoints can be changed online by updating global
|
||||
configuration in etcd itself - this allows to switch the cluster to new
|
||||
etcd addresses without downtime.
|
||||
info_ru: |
|
||||
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
|
||||
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
|
||||
|
||||
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
|
||||
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
|
||||
- name: etcd_prefix
|
||||
type: string
|
||||
default: "/vitastor"
|
||||
@@ -39,6 +31,5 @@
|
||||
- name: log_level
|
||||
type: int
|
||||
default: 0
|
||||
online: true
|
||||
info: Log level. Raise if you want more verbose output.
|
||||
info_ru: Уровень логирования. Повысьте, если хотите более подробный вывод.
|
||||
|
@@ -14,7 +14,6 @@ const L = {
|
||||
toc_config: '[Configuration](../config.en.md)',
|
||||
toc_usage: 'Usage',
|
||||
toc_performance: 'Performance',
|
||||
online: 'Can be changed online: yes',
|
||||
},
|
||||
ru: {
|
||||
Documentation: 'Документация',
|
||||
@@ -29,7 +28,6 @@ const L = {
|
||||
toc_config: '[Конфигурация](../config.ru.md)',
|
||||
toc_usage: 'Использование',
|
||||
toc_performance: 'Производительность',
|
||||
online: 'Можно менять на лету: да',
|
||||
},
|
||||
};
|
||||
const types = {
|
||||
@@ -72,8 +70,6 @@ for (const file of params_files)
|
||||
out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
|
||||
if (c.min !== undefined)
|
||||
out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
|
||||
if (c.online)
|
||||
out += `- ${L[lang]['online'] || 'Can be changed online: yes'}\n`;
|
||||
out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
|
||||
}
|
||||
const head = fs.readFileSync(__dirname+'/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
|
||||
|
@@ -164,21 +164,18 @@
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: Interval before attempting to reconnect to an unavailable OSD.
|
||||
info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
|
||||
- name: peer_connect_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: Timeout for OSD connection attempts.
|
||||
info_ru: Максимальное время ожидания попытки соединения с OSD.
|
||||
- name: osd_idle_timeout
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
OSD connection inactivity time after which clients and other OSDs send
|
||||
keepalive requests to check state of the connection.
|
||||
@@ -189,7 +186,6 @@
|
||||
type: sec
|
||||
min: 1
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
@@ -202,7 +198,6 @@
|
||||
type: ms
|
||||
min: 50
|
||||
default: 500
|
||||
online: true
|
||||
info: |
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
@@ -216,7 +211,6 @@
|
||||
- name: max_etcd_attempts
|
||||
type: int
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Maximum number of attempts for etcd requests which can't be retried
|
||||
indefinitely.
|
||||
@@ -226,7 +220,6 @@
|
||||
- name: etcd_quick_timeout
|
||||
type: ms
|
||||
default: 1000
|
||||
online: true
|
||||
info: |
|
||||
Timeout for etcd requests which should complete quickly, like lease refresh.
|
||||
info_ru: |
|
||||
@@ -235,7 +228,6 @@
|
||||
- name: etcd_slow_timeout
|
||||
type: ms
|
||||
default: 5000
|
||||
online: true
|
||||
info: Timeout for etcd requests which are allowed to wait for some time.
|
||||
info_ru: |
|
||||
Максимальное время выполнения запросов к etcd, для которых не обязательно
|
||||
@@ -243,7 +235,6 @@
|
||||
- name: etcd_keepalive_timeout
|
||||
type: sec
|
||||
default: max(30, etcd_report_interval*2)
|
||||
online: true
|
||||
info: |
|
||||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
@@ -253,7 +244,6 @@
|
||||
- name: etcd_ws_keepalive_timeout
|
||||
type: sec
|
||||
default: 30
|
||||
online: true
|
||||
info: |
|
||||
etcd websocket ping interval required to keep the connection alive and
|
||||
detect disconnections quickly.
|
||||
@@ -262,7 +252,6 @@
|
||||
- name: client_dirty_limit
|
||||
type: int
|
||||
default: 33554432
|
||||
online: true
|
||||
info: |
|
||||
Without immediate_commit=all this parameter sets the limit of "dirty"
|
||||
(not committed by fsync) data allowed by the client before forcing an
|
||||
|
@@ -1,5 +1,4 @@
|
||||
# Runtime OSD Parameters
|
||||
|
||||
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
|
||||
initialization and can be changed - either with an OSD restart or, for some of
|
||||
them, even without restarting by updating configuration in etcd.
|
||||
initialization and can be changed with an OSD restart.
|
||||
|
@@ -2,5 +2,4 @@
|
||||
|
||||
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
|
||||
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
|
||||
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
|
||||
изменения конфигурации в etcd.
|
||||
момент с перезапуском OSD.
|
||||
|
@@ -66,7 +66,6 @@
|
||||
- name: autosync_interval
|
||||
type: sec
|
||||
default: 5
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which automatic fsyncs/flushes are issued by each OSD when
|
||||
the immediate_commit mode is disabled. fsyncs are required because without
|
||||
@@ -84,7 +83,6 @@
|
||||
- name: autosync_writes
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Same as autosync_interval, but sets the maximum number of uncommitted write
|
||||
operations before issuing an fsync operation internally.
|
||||
@@ -95,7 +93,6 @@
|
||||
- name: recovery_queue_depth
|
||||
type: int
|
||||
default: 4
|
||||
online: true
|
||||
info: |
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed of recovery
|
||||
@@ -108,7 +105,6 @@
|
||||
- name: recovery_pg_switch
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
@@ -123,7 +119,6 @@
|
||||
- name: recovery_sync_batch
|
||||
type: int
|
||||
default: 16
|
||||
online: true
|
||||
info: Maximum number of recovery operations before issuing an additional fsync.
|
||||
info_ru: Максимальное число операций восстановления перед дополнительным fsync.
|
||||
- name: readonly
|
||||
@@ -138,7 +133,6 @@
|
||||
- name: no_recovery
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Disable automatic background recovery of objects. Note that it doesn't
|
||||
affect implicit recovery of objects happening during writes - a write is
|
||||
@@ -151,7 +145,6 @@
|
||||
- name: no_rebalance
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Disable background movement of data between different OSDs. Disabling it
|
||||
means that PGs in the `has_misplaced` state will be left in it indefinitely.
|
||||
@@ -162,7 +155,6 @@
|
||||
- name: print_stats_interval
|
||||
type: sec
|
||||
default: 3
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which OSDs print simple human-readable operation
|
||||
statistics on stdout.
|
||||
@@ -172,7 +164,6 @@
|
||||
- name: slow_log_interval
|
||||
type: sec
|
||||
default: 10
|
||||
online: true
|
||||
info: |
|
||||
Time interval at which OSDs dump slow or stuck operations on stdout, if
|
||||
there are any. Also it's the time after which an operation is considered
|
||||
@@ -184,7 +175,6 @@
|
||||
- name: inode_vanish_time
|
||||
type: sec
|
||||
default: 60
|
||||
online: true
|
||||
info: |
|
||||
Number of seconds after which a deleted inode is removed from OSD statistics.
|
||||
info_ru: |
|
||||
@@ -192,7 +182,6 @@
|
||||
- name: max_write_iodepth
|
||||
type: int
|
||||
default: 128
|
||||
online: true
|
||||
info: |
|
||||
Parallel client write operation limit per one OSD. Operations that exceed
|
||||
this limit are pushed to a temporary queue instead of being executed
|
||||
@@ -204,7 +193,6 @@
|
||||
- name: min_flusher_count
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Flusher is a micro-thread that moves data from the journal to the data
|
||||
area of the device. Their number is auto-tuned between minimum and maximum.
|
||||
@@ -216,7 +204,6 @@
|
||||
- name: max_flusher_count
|
||||
type: int
|
||||
default: 256
|
||||
online: true
|
||||
info: |
|
||||
Maximum number of journal flushers (see above min_flusher_count).
|
||||
info_ru: |
|
||||
@@ -297,7 +284,6 @@
|
||||
- name: throttle_small_writes
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
|
||||
with fast journal/metadata devices and slow data devices. The idea is that
|
||||
@@ -326,7 +312,6 @@
|
||||
- name: throttle_target_iops
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Target maximum number of throttled operations per second under the condition
|
||||
of full journal. Set it to approximate random write iops of your data devices
|
||||
@@ -339,7 +324,6 @@
|
||||
- name: throttle_target_mbs
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Target maximum bandwidth in MB/s of throttled operations per second under
|
||||
the condition of full journal. Set it to approximate linear write
|
||||
@@ -352,7 +336,6 @@
|
||||
- name: throttle_target_parallelism
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Target maximum parallelism of throttled operations under the condition of
|
||||
full journal. Set it to approximate internal parallelism of your data
|
||||
@@ -365,7 +348,6 @@
|
||||
- name: throttle_threshold_us
|
||||
type: us
|
||||
default: 50
|
||||
online: true
|
||||
info: |
|
||||
Minimal computed delay to be applied to throttled operations. Usually
|
||||
doesn't need to be changed.
|
||||
@@ -375,151 +357,10 @@
|
||||
- name: osd_memlock
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
info: >
|
||||
Lock all OSD memory to prevent it from being unloaded into swap with
|
||||
mlockall(). Requires sufficient ulimit -l (max locked memory).
|
||||
info_ru: |
|
||||
info_ru: >
|
||||
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
|
||||
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
|
||||
заблокированной памяти).
|
||||
- name: auto_scrub
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Data scrubbing is the process of background verification of copies to find
|
||||
and repair corrupted blocks. It's not run automatically by default since
|
||||
it's a new feature. Set this parameter to true to enable automatic scrubs.
|
||||
|
||||
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
|
||||
every `scrub_interval` (see below). You can also start/schedule scrubbing
|
||||
manually by setting `next_scrub` JSON key to the desired UNIX time of the
|
||||
next scrub in `/pg/history/...` values in etcd.
|
||||
info_ru: |
|
||||
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
|
||||
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
|
||||
запускаются автоматически, так как являются новой функцией. Чтобы включить
|
||||
автоматическое планирование скрабов, установите данный параметр в true.
|
||||
|
||||
Включённый параметр заставляет OSD автоматически планировать фоновую
|
||||
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
|
||||
запустить или запланировать проверку вручную, установив значение ключа JSON
|
||||
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
|
||||
желаемой проверки.
|
||||
- name: no_scrub
|
||||
type: bool
|
||||
default: false
|
||||
online: true
|
||||
info: |
|
||||
Temporarily disable scrubbing and stop running scrubs.
|
||||
info_ru: |
|
||||
Временно отключить и остановить запущенные скрабы.
|
||||
- name: scrub_interval
|
||||
type: string
|
||||
default: 30d
|
||||
online: true
|
||||
info: |
|
||||
Default automatic scrubbing interval for all pools. Numbers without suffix
|
||||
are treated as seconds, possible unit suffixes include 's' (seconds),
|
||||
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
|
||||
info_ru: |
|
||||
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
|
||||
Значения без указанной единицы измерения считаются в секундах, допустимые
|
||||
символы единиц измерения в конце: 's' (секунды),
|
||||
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяцы) или 'y' (годы).
|
||||
- name: scrub_queue_depth
|
||||
type: int
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Number of parallel scrubbing operations per one OSD.
|
||||
info_ru: |
|
||||
Число параллельных операций фоновой проверки на один OSD.
|
||||
- name: scrub_sleep
|
||||
type: ms
|
||||
default: 0
|
||||
online: true
|
||||
info: |
|
||||
Additional interval between two consecutive scrubbing operations on one OSD.
|
||||
Can be used to slow down scrubbing if it affects user load too much.
|
||||
info_ru: |
|
||||
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
|
||||
одном OSD. Может использоваться для замедления скраба, если он слишком
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
- name: scrub_list_limit
|
||||
type: int
|
||||
default: 1000
|
||||
online: true
|
||||
info: |
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
info_ru: |
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
проверки.
|
||||
- name: scrub_find_best
|
||||
type: bool
|
||||
default: true
|
||||
online: true
|
||||
info: |
|
||||
Find and automatically restore best versions of objects with unmatched
|
||||
copies. In replicated setups, the best version is the version with most
|
||||
matching replicas. In EC setups, the best version is the subset of data
|
||||
and parity chunks without mismatches.
|
||||
|
||||
The hypothetical situation where you might want to disable it is when
|
||||
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
|
||||
corrupt an object in the same way (for example, zero it out) and only
|
||||
1 HDD will remain good. In this case disabling scrub_find_best may help
|
||||
you to recover the data! See also scrub_ec_max_bruteforce below.
|
||||
info_ru: |
|
||||
Находить и автоматически восстанавливать "лучшие версии" объектов с
|
||||
несовпадающими копиями/частями. При использовании репликации "лучшая"
|
||||
версия - версия, доступная в большем числе экземпляров, чем другие. При
|
||||
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
|
||||
частей данных и чётности, полностью соответствующих друг другу.
|
||||
|
||||
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
|
||||
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
|
||||
незаметно и одинаково повредить данные одного и того же объекта, например,
|
||||
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
|
||||
отключение этого параметра поможет вам восстановить данные! Смотрите также
|
||||
описание следующего параметра - scrub_ec_max_bruteforce.
|
||||
- name: scrub_ec_max_bruteforce
|
||||
type: int
|
||||
default: 100
|
||||
online: true
|
||||
info: |
|
||||
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
|
||||
chunk by brute-forcing all possible error locations. This configuration
|
||||
value limits the maximum number of checked combinations. You can try to
|
||||
increase it if you have EC N+K setup with N and K large enough for
|
||||
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
|
||||
than the default 100.
|
||||
|
||||
If there are too many possible combinations or if multiple combinations give
|
||||
correct results then objects are marked inconsistent and aren't recovered
|
||||
automatically.
|
||||
|
||||
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
|
||||
the variant with most available equal copies is correct. For example, if
|
||||
you have 3 replicas and 1 of them differs, this one is considered to be
|
||||
corrupted. But if there is no "best" version with more copies than all
|
||||
others have then the object is also marked as inconsistent.
|
||||
info_ru: |
|
||||
Vitastor старается определить повреждённые части объектов при использовании
|
||||
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
|
||||
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
|
||||
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
|
||||
его, если используете схему кодирования EC N+K с N и K, достаточно большими
|
||||
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
|
||||
было больше, чем стандартное значение 100.
|
||||
|
||||
Если возможных комбинаций слишком много или если корректная комбинация не
|
||||
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
|
||||
и не восстанавливаются автоматически.
|
||||
|
||||
При использовании репликации перебор не нужен, Vitastor просто предполагает,
|
||||
что вариант объекта с наибольшим количеством одинаковых копий корректен.
|
||||
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
|
@@ -45,10 +45,3 @@
|
||||
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
|
||||
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
- node.js 10 or newer
|
||||
|
||||
## Version archive
|
||||
|
||||
All previous Vitastor and other components (QEMU, etcd...) package builds
|
||||
can be found here:
|
||||
|
||||
https://vitastor.io/archive/
|
||||
|
@@ -44,10 +44,3 @@
|
||||
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
|
||||
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
- node.js 10 или новее
|
||||
|
||||
## Архив предыдущих версий
|
||||
|
||||
Все предыдущие сборки пакетов Vitastor и других компонентов, таких, как QEMU
|
||||
и etcd, можно скачать по следующей ссылке:
|
||||
|
||||
https://vitastor.io/archive/
|
||||
|
@@ -29,7 +29,6 @@
|
||||
- Snapshots and copy-on-write image clones
|
||||
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
||||
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
|
||||
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
|
||||
|
||||
## Plugins and tools
|
||||
|
||||
@@ -55,6 +54,7 @@ The following features are planned for the future:
|
||||
- iSCSI proxy
|
||||
- Multi-threaded client
|
||||
- Faster failover
|
||||
- Scrubbing without checksums (verification of replicas)
|
||||
- Checksums
|
||||
- Tiered storage (SSD caching)
|
||||
- NVDIMM support
|
||||
|
@@ -31,7 +31,6 @@
|
||||
- Снапшоты и copy-on-write клоны
|
||||
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
|
||||
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
|
||||
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
|
||||
|
||||
## Драйверы и инструменты
|
||||
|
||||
@@ -55,6 +54,7 @@
|
||||
- iSCSI-прокси
|
||||
- Многопоточный клиент
|
||||
- Более быстрое переключение при отказах
|
||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||
- Контрольные суммы
|
||||
- Поддержка SSD-кэширования (tiered storage)
|
||||
- Поддержка NVDIMM
|
||||
|
@@ -20,8 +20,6 @@ It supports the following commands:
|
||||
- [flatten](#flatten)
|
||||
- [rm-data](#rm-data)
|
||||
- [merge-data](#merge-data)
|
||||
- [describe](#describe)
|
||||
- [fix](#fix)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
|
||||
@@ -176,51 +174,6 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
|
||||
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between
|
||||
`<from>` and `<to>`, including `<from>` and `<to>`.
|
||||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
|
||||
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
|
||||
[--min-offset <offset>] [--max-offset <offset>]`
|
||||
|
||||
Describe unclean object locations in the cluster.
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
Only list objects from primary OSD(s) <osds>.
|
||||
--object-state <states>
|
||||
Only list objects in given state(s). State(s) may include:
|
||||
degraded, misplaced, incomplete, corrupted, inconsistent.
|
||||
--pool <pool name or number>
|
||||
Only list objects in the given pool.
|
||||
--inode, --min-inode, --max-inode
|
||||
Restrict listing to specific inode numbers.
|
||||
--min-offset, --max-offset
|
||||
Restrict listing to specific offsets inside inodes.
|
||||
```
|
||||
|
||||
## fix
|
||||
|
||||
`vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]`
|
||||
|
||||
Fix inconsistent objects in the cluster by deleting some copies.
|
||||
|
||||
```
|
||||
--objects <objects>
|
||||
Objects to fix, either in plain text or JSON format. If not specified,
|
||||
object list will be read from STDIN in one of the same formats.
|
||||
Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...
|
||||
JSON format: [{"inode":"0x...","stripe":"0x..."},...]
|
||||
--bad-osds <osds>
|
||||
Remove inconsistent copies/parts of objects from these OSDs, effectively
|
||||
marking them bad and allowing Vitastor to recover objects from other copies.
|
||||
--part <number>
|
||||
Only remove EC part <number> (from 0 to pg_size-1), required for extreme
|
||||
edge cases where one OSD has multiple parts of an EC object.
|
||||
--check no
|
||||
Do not recheck that requested objects are actually inconsistent,
|
||||
delete requested copies/parts anyway.
|
||||
```
|
||||
|
||||
## alloc-osd
|
||||
|
||||
`vitastor-cli alloc-osd`
|
||||
|
@@ -184,59 +184,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
в целевой образ `<target>`. `<to>` должен быть дочерним образом `<from>`, а `<target>`
|
||||
должен быть одним из слоёв между `<from>` и `<to>`, включая сами `<from>` и `<to>`.
|
||||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
|
||||
[--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
|
||||
[--min-offset <смещение>] [--max-offset <смещение>]`
|
||||
|
||||
Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
|
||||
или части которых хранятся на наборе OSD, не равном целевому.
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
Перечислять только объекты с первичных OSD из списка <osds>.
|
||||
--object-state <состояния>
|
||||
Перечислять только объекты в указанных состояниях. Возможные состояния
|
||||
объектов:
|
||||
- degraded - деградированная избыточность
|
||||
- misplaced - перемещённый
|
||||
- incomplete - нечитаемый из-за потери большего числа частей, чем допустимо
|
||||
- corrupted - с одной или более повреждённой частью
|
||||
- inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
|
||||
--pool <имя или ID пула>
|
||||
Перечислять только объекты из заданного пула.
|
||||
--inode, --min-inode, --max-inode
|
||||
Перечислять только объекты из указанных номеров инодов (образов).
|
||||
--min-offset, --max-offset
|
||||
Перечислять только объекты с заданных смещений внутри образов.
|
||||
```
|
||||
|
||||
## fix
|
||||
|
||||
`vitastor-cli fix [--objects <объекты>] [--bad-osds <osds>] [--part <номер>] [--check no]`
|
||||
|
||||
Исправить неконсистентные (неоднозначные) объекты путём удаления части копий.
|
||||
|
||||
```
|
||||
--objects <объекты>
|
||||
Объекты для исправления - в простом текстовом или JSON формате. Если опция
|
||||
не указана, список объектов читается со стандартного ввода в тех же форматах.
|
||||
Простой формат: 0x<инод>:0x<смещение> <любой разделитель> 0x<инод>:0x<смещение> ...
|
||||
Формат JSON: [{"inode":"0x<инод>","stripe":"0x<смещение>"},...]
|
||||
--bad-osds <osds>
|
||||
Удалить неконсистентные копии/части объектов с данных OSD, таким образом
|
||||
признавая потерю этих копий и позволяя Vitastor-у восстановить объекты из
|
||||
других копий.
|
||||
--part <номер>
|
||||
Удалить только части EC с заданным номером (от 0 до pg_size-1). Нужно только
|
||||
в редких граничных случаях, когда один и тот же OSD содержит несколько частей
|
||||
одного EC-объекта.
|
||||
--check no
|
||||
Не перепроверять, что заданные объекты действительно в неконсистентном
|
||||
состоянии и просто удалять заданные части.
|
||||
```
|
||||
|
||||
## alloc-osd
|
||||
|
||||
`vitastor-cli alloc-osd`
|
||||
|
@@ -25,23 +25,6 @@ It will output a block device name like /dev/nbd0 which you can then use as a no
|
||||
|
||||
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||
|
||||
Additional options for map command:
|
||||
|
||||
* `--nbd_timeout 30` \
|
||||
Timeout for I/O operations in seconds; when it is exceeded, the kernel stops
|
||||
the device. You can set it to 0 to disable the timeout, but beware that you
|
||||
won't be able to stop the device at all if vitastor-nbd process dies.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
|
||||
Note that the maximum allowed value of (nbds_max)*(1+max_part) is 256.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Write log messages to the specified file instead of dropping them (in background mode)
|
||||
or printing them to the standard output (in foreground mode).
|
||||
* `--dev_num N` \
|
||||
Use the specified device /dev/nbdN instead of automatic selection.
|
||||
* `--foreground 1` \
|
||||
Stay in foreground, do not daemonize.
|
||||
|
||||
## Unmap image
|
||||
|
||||
To unmap the device run:
|
||||
@@ -49,27 +32,3 @@ To unmap the device run:
|
||||
```
|
||||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## List mapped images
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
Example output (normal format):
|
||||
|
||||
```
|
||||
/dev/nbd0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/nbd1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Example output (JSON format):
|
||||
|
||||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
@@ -30,27 +30,6 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||
|
||||
Дополнительные опции для команды подключения NBD-устройства:
|
||||
|
||||
* `--nbd_timeout 30` \
|
||||
Максимальное время выполнения любой операции чтения/записи в секундах, при
|
||||
превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
|
||||
в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
|
||||
вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
|
||||
vitastor-nbd.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
|
||||
(`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
|
||||
обычно не должно превышать 256.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
|
||||
при фоновом режиме запуска или печати на стандартный вывод при запуске
|
||||
в консоли с `--foreground 1`.
|
||||
* `--dev_num N` \
|
||||
Использовать заданное устройство `/dev/nbdN` вместо автоматического подбора.
|
||||
* `--foreground 1` \
|
||||
Не уводить процесс в фоновый режим.
|
||||
|
||||
## Отключить устройство
|
||||
|
||||
Для отключения устройства выполните:
|
||||
@@ -58,27 +37,3 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
```
|
||||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## Вывести подключённые устройства
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
Пример вывода в обычном формате:
|
||||
|
||||
```
|
||||
/dev/nbd0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/nbd1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Пример вывода в JSON-формате:
|
||||
|
||||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
@@ -50,7 +50,8 @@ async function lp_solve(text)
|
||||
return { score, vars };
|
||||
}
|
||||
|
||||
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, hier_sizes = null,
|
||||
max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
|
||||
{
|
||||
if (!pg_count || !osd_tree)
|
||||
{
|
||||
@@ -58,7 +59,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
||||
}
|
||||
const all_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
|
||||
const all_pgs = Object.values(random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout));
|
||||
const pg_per_osd = {};
|
||||
for (const pg of all_pgs)
|
||||
{
|
||||
@@ -216,39 +217,45 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
|
||||
return move_weights;
|
||||
}
|
||||
|
||||
function add_valid_previous(osd_tree, prev_weights, all_pgs)
|
||||
function build_parent_per_leaf(osd_tree, res = {}, parents = [])
|
||||
{
|
||||
for (const item in osd_tree)
|
||||
{
|
||||
if (osd_tree[item] instanceof Object)
|
||||
build_parent_per_leaf(osd_tree[item], res, [ ...parents, item ]);
|
||||
else
|
||||
res[item] = parents;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
function add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes)
|
||||
{
|
||||
// Add previous combinations that are still valid
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const host_per_osd = {};
|
||||
for (const host in osd_tree)
|
||||
{
|
||||
for (const osd in osd_tree[host])
|
||||
{
|
||||
host_per_osd[osd] = host;
|
||||
}
|
||||
}
|
||||
const parent_per_osd = build_parent_per_leaf(osd_tree);
|
||||
skip_pg: for (const pg_name in prev_weights)
|
||||
{
|
||||
const seen_hosts = {};
|
||||
const seen = [];
|
||||
const pg = pg_name.substr(3).split(/_/);
|
||||
for (const osd of pg)
|
||||
{
|
||||
if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
|
||||
{
|
||||
if (!parent_per_osd[osd])
|
||||
continue skip_pg;
|
||||
for (let i = 0; i < parent_per_osd[osd].length; i++)
|
||||
{
|
||||
seen[parent_per_osd[osd][i]]++;
|
||||
if (seen[parent_per_osd[osd][i]] > hier_sizes[i])
|
||||
continue skip_pg;
|
||||
}
|
||||
seen_hosts[host_per_osd[osd]] = true;
|
||||
}
|
||||
if (!all_pgs[pg_name])
|
||||
{
|
||||
all_pgs[pg_name] = pg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to minimize data movement
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2,
|
||||
hier_sizes = null, max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
|
||||
{
|
||||
if (!osd_tree)
|
||||
{
|
||||
@@ -273,10 +280,10 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
||||
}
|
||||
const old_pg_size = prev_int_pgs[0].length;
|
||||
// Get all combinations
|
||||
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
|
||||
let all_pgs = random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout);
|
||||
if (old_pg_size == pg_size)
|
||||
{
|
||||
add_valid_previous(osd_tree, prev_weights, all_pgs);
|
||||
add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes || [ pg_size, 1 ]);
|
||||
}
|
||||
all_pgs = Object.values(all_pgs);
|
||||
const pg_per_osd = {};
|
||||
@@ -502,41 +509,147 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
|
||||
// Convert multi-level tree_node = { level: number|string, id?: string, size?: number, children?: tree_node[] }
|
||||
// levels = { string: number }
|
||||
// to a two-level osd_tree suitable for all_combinations()
|
||||
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
|
||||
// to a multi-level OSD tree suitable for random_hier_combinations()
|
||||
// (or in case of just 2 levels - for all_combinations() / random_combinations())
|
||||
//
|
||||
// Example:
|
||||
// tree_node = { level: 'dc', children: [ { level: 'rack', children: [ { level: 'host', children: [ { level: 'osd', size: 10 } ] } ] } ] }
|
||||
// extract_levels = [ 'rack', 'osd' ]
|
||||
// level_defs = { dc: 1, rack: 2, host: 3, osd: 4 }
|
||||
//
|
||||
// Result:
|
||||
// { rack0: { osd1: 10 } }
|
||||
function extract_tree_levels(tree_node, extract_levels, level_defs, new_tree = { idx: 1, items: {} })
|
||||
{
|
||||
osd_level = levels[osd_level] || osd_level;
|
||||
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
|
||||
for (const node of osd_tree)
|
||||
const next_level = Number(level_defs[extract_levels[0]] || extract_levels[0]) || 0;
|
||||
const level_name = level_defs[extract_levels[0]] ? extract_levels[0] : 'l'+extract_levels[0]+'_';
|
||||
const is_leaf = extract_levels.length == 1;
|
||||
if ((level_defs[tree_node.level] || tree_node.level) >= next_level)
|
||||
{
|
||||
if ((levels[node.level] || node.level) < failure_domain_level)
|
||||
if (!is_leaf)
|
||||
{
|
||||
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
|
||||
// Insert a (possibly fake) level
|
||||
const nt = { idx: 1, items: {} };
|
||||
new_tree.items[level_name+(new_tree.idx++)] = nt.items;
|
||||
extract_tree_levels(tree_node, extract_levels.slice(1), level_defs, nt);
|
||||
}
|
||||
else
|
||||
{
|
||||
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
|
||||
// Insert a leaf node
|
||||
const leaf_id = tree_node.id || (level_name+(new_tree.idx++));
|
||||
new_tree.items[leaf_id] = tree_node.size;
|
||||
}
|
||||
}
|
||||
return domains;
|
||||
else
|
||||
{
|
||||
for (const child_node of tree_node.children||[])
|
||||
{
|
||||
extract_tree_levels(child_node, extract_levels, level_defs, new_tree);
|
||||
}
|
||||
}
|
||||
return new_tree.items;
|
||||
}
|
||||
|
||||
function extract_osds(osd_tree, levels, osd_level, osds = {})
|
||||
// Generate random PGs with hierarchical failure domains, for example 3 DCs each with 2 hosts
|
||||
// osd_tree = { level3_id: { level2_id: { level1_id: scalar_value } }, ... }
|
||||
// osd_tree may contain an arbitrary number of levels, but the level count must be the same across the whole tree
|
||||
// size_per_level = number of items to select on each level, for example [3, 2, 1].
|
||||
// must have the same number of items as the osd_tree level count.
|
||||
// count = PG count to generate
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
|
||||
// seq_layout = true for the [DC1,DC1,DC2,DC2,DC3,DC3] layout, false for [DC1,DC2,DC3,DC1,DC2,DC3] layout
|
||||
function random_hier_combinations(osd_tree, size_per_level, count, ordered, seq_layout)
|
||||
{
|
||||
for (const node of osd_tree)
|
||||
let seed = 0x5f020e43;
|
||||
const rng = () =>
|
||||
{
|
||||
if ((levels[node.level] || node.level) >= osd_level)
|
||||
seed ^= seed << 13;
|
||||
seed ^= seed >> 17;
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const get_max_level = (o) =>
|
||||
{
|
||||
let lvl = 0;
|
||||
while (o instanceof Object)
|
||||
{
|
||||
osds[node.id] = node.size;
|
||||
for (const k in o)
|
||||
{
|
||||
lvl++;
|
||||
o = o[k];
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
return lvl;
|
||||
};
|
||||
const max_level = get_max_level(osd_tree);
|
||||
const gen_pg = (select) =>
|
||||
{
|
||||
let pg = [ osd_tree ];
|
||||
for (let level = 0; level < max_level; level++)
|
||||
{
|
||||
extract_osds(node.children||[], levels, osd_level, osds);
|
||||
let npg = [];
|
||||
for (let i = 0; i < pg.length; i++)
|
||||
{
|
||||
const keys = pg[i] instanceof Object ? Object.keys(pg[i]) : [];
|
||||
const max_keys = keys.length < size_per_level[level] ? keys.length : size_per_level[level];
|
||||
for (let j = 0; j < max_keys; j++)
|
||||
{
|
||||
const r = select(level, i, j, (ordered ? keys.length : (keys.length - (max_keys - j - 1))));
|
||||
const el = pg[i][keys[r]] instanceof Object ? pg[i][keys[r]] : keys[r];
|
||||
npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = el;
|
||||
keys.splice(ordered ? r : 0, ordered ? 1 : (r+1));
|
||||
}
|
||||
for (let j = max_keys; j < size_per_level[level]; j++)
|
||||
npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = NO_OSD;
|
||||
}
|
||||
pg = npg;
|
||||
}
|
||||
return pg;
|
||||
};
|
||||
const r = {};
|
||||
// Generate random combinations including each OSD at least once
|
||||
let has_next = true;
|
||||
let ctr = [];
|
||||
while (has_next)
|
||||
{
|
||||
let pg = gen_pg((level, i, j, n) =>
|
||||
{
|
||||
if (i == 0 && j == 0)
|
||||
{
|
||||
// Select a pre-determined OSD in the first position on each level
|
||||
const r = ctr[level] == null || ctr[level][1] != n ? 0 : ctr[level][0];
|
||||
ctr[level] = [ r, n ];
|
||||
return r;
|
||||
}
|
||||
return rng() % n;
|
||||
});
|
||||
for (let i = ctr.length-1; i >= 0; i--)
|
||||
{
|
||||
ctr[i][0]++;
|
||||
if (ctr[i][0] < ctr[i][1])
|
||||
break;
|
||||
else
|
||||
ctr[i] = null;
|
||||
}
|
||||
has_next = ctr[0] != null;
|
||||
const cyclic_pgs = [ pg ];
|
||||
if (ordered)
|
||||
for (let i = 1; i < pg.size; i++)
|
||||
cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
|
||||
for (const pg of cyclic_pgs)
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
}
|
||||
return osds;
|
||||
// Generate purely random combinations
|
||||
while (count > 0)
|
||||
{
|
||||
let pg = gen_pg((l, i, j, n) => rng() % n);
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
count--;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
|
||||
@@ -752,11 +865,12 @@ module.exports = {
|
||||
pg_weights_space_efficiency,
|
||||
pg_list_space_efficiency,
|
||||
pg_per_osd_space_efficiency,
|
||||
flatten_tree,
|
||||
extract_tree_levels,
|
||||
|
||||
lp_solve,
|
||||
make_int_pgs,
|
||||
align_pgs,
|
||||
random_combinations,
|
||||
random_hier_combinations,
|
||||
all_combinations,
|
||||
};
|
||||
|
81
mon/mon.js
81
mon/mon.js
@@ -104,21 +104,12 @@ const etcd_tree = {
|
||||
autosync_writes: 128,
|
||||
client_queue_depth: 128, // unused
|
||||
recovery_queue_depth: 4,
|
||||
recovery_pg_switch: 128,
|
||||
recovery_sync_batch: 16,
|
||||
no_recovery: false,
|
||||
no_rebalance: false,
|
||||
print_stats_interval: 3,
|
||||
slow_log_interval: 10,
|
||||
inode_vanish_time: 60,
|
||||
auto_scrub: false,
|
||||
no_scrub: false,
|
||||
scrub_interval: '30d', // 1s/1m/1h/1d
|
||||
scrub_queue_depth: 1,
|
||||
scrub_sleep: 0, // milliseconds
|
||||
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
||||
scrub_find_best: true,
|
||||
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
@@ -168,6 +159,10 @@ const etcd_tree = {
|
||||
// number of parity chunks, required for EC
|
||||
parity_chunks?: 1,
|
||||
pg_count: 100,
|
||||
// failure_domain = string | { string: int }
|
||||
// the second case specifies multiple failure domains. example:
|
||||
// { datacenter: 3, host: 2 } - means 3 datacenters with 2 hosts each, for EC 4+2
|
||||
// guarantees availability on outage of either 1 datacenter or 2 hosts
|
||||
failure_domain: 'host',
|
||||
max_osd_combinations: 10000,
|
||||
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
|
||||
@@ -181,8 +176,6 @@ const etcd_tree = {
|
||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// prefer to put primary on OSD with these tags
|
||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// scrub interval
|
||||
scrub_interval?: '30d',
|
||||
},
|
||||
...
|
||||
}, */
|
||||
@@ -278,7 +271,7 @@ const etcd_tree = {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
}
|
||||
}, */
|
||||
},
|
||||
@@ -300,7 +293,6 @@ const etcd_tree = {
|
||||
osd_sets: osd_num_t[][],
|
||||
all_peers: osd_num_t[],
|
||||
epoch: uint64_t,
|
||||
next_scrub: uint64_t,
|
||||
},
|
||||
}, */
|
||||
},
|
||||
@@ -1039,6 +1031,32 @@ class Mon
|
||||
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
|
||||
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
|
||||
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
|
||||
if (pool_cfg.failure_domain instanceof Object)
|
||||
{
|
||||
for (const key in pool_cfg.failure_domain)
|
||||
{
|
||||
const cnt = parseInt(pool_cfg.failure_domain[key]);
|
||||
if (!cnt || cnt <= 0)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' specifies invalid item count for failure domain \"'+key+'\"');
|
||||
return false;
|
||||
}
|
||||
if (key !== 'host' && key != 'osd' && !(key in this.config.placement_levels||{}))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' uses invalid failure domain \"'+key+'\"');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pool_cfg.failure_domain !== 'host' && pool_cfg.failure_domain != 'osd' &&
|
||||
!(pool_cfg.failure_domain in this.config.placement_levels||{}))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' uses invalid failure domain \"'+pool_cfg.failure_domain+'\"');
|
||||
return false;
|
||||
}
|
||||
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
|
||||
if (!/^[1-9]\d*$/.exec(''+pool_id))
|
||||
{
|
||||
@@ -1124,27 +1142,23 @@ class Mon
|
||||
filter_osds_by_tags(orig_tree, flat_tree, tags)
|
||||
{
|
||||
if (!tags)
|
||||
{
|
||||
return;
|
||||
}
|
||||
return 1;
|
||||
for (const tag of (tags instanceof Array ? tags : [ tags ]))
|
||||
{
|
||||
for (const host in flat_tree)
|
||||
for (const item in flat_tree)
|
||||
{
|
||||
let found = 0;
|
||||
for (const osd in flat_tree[host])
|
||||
if (flat_tree[item] instanceof Object)
|
||||
{
|
||||
if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
|
||||
delete flat_tree[host][osd];
|
||||
else
|
||||
found++;
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete flat_tree[host];
|
||||
if (!filter_osds_by_tags(orig_tree, flat_tree[item], tags))
|
||||
delete flat_tree[item];
|
||||
}
|
||||
else if (!orig_tree[item].tags || !orig_tree[item].tags[tag])
|
||||
delete flat_tree[item];
|
||||
}
|
||||
}
|
||||
for (const item in flat_tree)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
get_affinity_osds(pool_cfg, up_osds, osd_tree)
|
||||
@@ -1203,9 +1217,11 @@ class Mon
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''];
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''] || {};
|
||||
const failure_domains = pool_cfg.failure_domain instanceof Object
|
||||
? [ ...Object.keys(pool_cfg.failure_domain), 'osd' ]
|
||||
: [ pool_cfg.failure_domain, 'osd' ];
|
||||
pool_tree = LPOptimizer.extract_tree_levels(pool_tree, failure_domains, levels);
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
// These are for the purpose of building history.osd_sets
|
||||
const real_prev_pgs = [];
|
||||
@@ -1232,6 +1248,9 @@ class Mon
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
hier_sizes: pool_cfg.failure_domain instanceof Object
|
||||
? [ ...Object.values(pool_cfg.failure_domain), 1 ]
|
||||
: null,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
};
|
||||
@@ -1287,7 +1306,7 @@ class Mon
|
||||
} });
|
||||
}
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length); // FIXME requires hier support too
|
||||
this.state.pool.stats[pool_id] = {
|
||||
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
|
||||
total_raw_tb: optimize_result.space,
|
||||
|
@@ -36,7 +36,7 @@ const crush_tree = [
|
||||
] },
|
||||
];
|
||||
|
||||
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
|
||||
const osd_tree = LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {});
|
||||
console.log(osd_tree);
|
||||
|
||||
async function run()
|
||||
@@ -47,32 +47,32 @@ async function run()
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
console.log('\nAdding 1st failure domain:');
|
||||
cur_tree['dom1'] = osd_tree['dom1'];
|
||||
cur_tree['l1_1'] = osd_tree['l1_1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nAdding 2nd failure domain:');
|
||||
cur_tree['dom2'] = osd_tree['dom2'];
|
||||
cur_tree['l1_2'] = osd_tree['l1_2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nAdding 3rd failure domain:');
|
||||
cur_tree['dom3'] = osd_tree['dom3'];
|
||||
cur_tree['l1_3'] = osd_tree['l1_3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 36 && res.total_space == 36);
|
||||
console.log('\nRemoving 3rd failure domain:');
|
||||
delete cur_tree['dom3'];
|
||||
delete cur_tree['l1_3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nRemoving 2nd failure domain:');
|
||||
delete cur_tree['dom2'];
|
||||
delete cur_tree['l1_2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nRemoving 1st failure domain:');
|
||||
delete cur_tree['dom1'];
|
||||
delete cur_tree['l1_1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
|
@@ -108,7 +108,11 @@ async function run()
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
|
||||
console.log('\n256 PGs, size=3, failure domain=rack');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
|
||||
res = await LPOptimizer.optimize_initial({
|
||||
osd_tree: LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {}),
|
||||
pg_size: 3,
|
||||
pg_count: 256,
|
||||
});
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
}
|
||||
|
||||
|
56
mon/test-random-hier.js
Normal file
56
mon/test-random-hier.js
Normal file
@@ -0,0 +1,56 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
100: { 110: { 111: 1, 112: 1 }, 120: { 121: 1, 122: 1 } },
|
||||
200: { 210: { 211: 1, 212: 1 }, 220: { 221: 1, 222: 1 } },
|
||||
300: { 310: { 311: 1, 312: 1 }, 320: { 321: 1, 322: 1 } },
|
||||
400: { 410: { 411: 1, 412: 1 }, 420: { 421: 1, 422: 1 } },
|
||||
500: { 510: { 511: 1, 512: 1 }, 520: { 521: 1, 522: 1 } },
|
||||
};
|
||||
|
||||
const osd_tree2 = {
|
||||
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
|
||||
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
|
||||
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
|
||||
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
|
||||
500: { 511: 1, 512: 1, 521: 1, 522: 1 },
|
||||
};
|
||||
|
||||
const osd_tree3 = {
|
||||
100: { 111: 1, 112: 1, 121: 1, 122: 1 },
|
||||
200: { 211: 1, 212: 1, 221: 1, 222: 1 },
|
||||
300: { 311: 1, 312: 1, 321: 1, 322: 1 },
|
||||
400: { 411: 1, 412: 1, 421: 1, 422: 1 },
|
||||
500: { 511: 1 },
|
||||
};
|
||||
|
||||
async function run()
|
||||
{
|
||||
let r;
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree, [ 3, 2, 1 ], 10000, false, true));
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree2, [ 3, 2 ], 0, false, true));
|
||||
// Will contain 'Z':
|
||||
console.log(r = LPOptimizer.random_combinations(osd_tree2, 6, 0, true));
|
||||
console.log(r = LPOptimizer.extract_tree_levels(
|
||||
{ level: 'dc', children: [
|
||||
{ level: 'rack', children: [
|
||||
{ level: 'host', children: [
|
||||
{ level: 'osd', id: 'OSD5', size: 10 },
|
||||
] },
|
||||
] },
|
||||
{ level: 'osd', id: 'OSD10', size: 10 },
|
||||
] },
|
||||
[ 'rack', 'osd' ],
|
||||
{ dc: 1, rack: 2, host: 3, osd: 4 }
|
||||
));
|
||||
if (JSON.stringify(r) != '{"rack1":{"OSD5":10},"rack2":{"OSD10":10}}')
|
||||
throw new Error('extract_tree_levels failed');
|
||||
// should not contain Z:
|
||||
console.log(r = LPOptimizer.random_hier_combinations(osd_tree3, [ 3, 2 ], 0, false, true));
|
||||
console.log('OK');
|
||||
}
|
||||
|
||||
run().catch(console.error);
|
@@ -388,6 +388,8 @@ sub unmap_volume
|
||||
my ($class, $storeid, $scfg, $volname, $snapname) = @_;
|
||||
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
|
||||
|
||||
return 1 if !$scfg->{vitastor_nbd};
|
||||
|
||||
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
|
||||
$name .= '@'.$snapname if $snapname;
|
||||
|
||||
@@ -411,7 +413,7 @@ sub activate_volume
|
||||
sub deactivate_volume
|
||||
{
|
||||
my ($class, $storeid, $scfg, $volname, $snapname, $cache) = @_;
|
||||
$class->unmap_volume($storeid, $scfg, $volname, $snapname) if $scfg->{vitastor_nbd};
|
||||
$class->unmap_volume($storeid, $scfg, $volname, $snapname);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.9.1'
|
||||
VERSION = '0.8.9'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -24,4 +24,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.9.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.1$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.8.9/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.9$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.9.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.1
|
||||
Version: 0.8.9
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.1.el7.tar.gz
|
||||
Source0: vitastor-0.8.9.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.9.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.1
|
||||
Version: 0.8.9
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.1.el8.tar.gz
|
||||
Source0: vitastor-0.8.9.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -18,7 +18,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.1.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.9.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.1
|
||||
Version: 0.8.9
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.1.el9.tar.gz
|
||||
Source0: vitastor-0.8.9.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -73,7 +73,7 @@ Vitastor library headers for development.
|
||||
Summary: Vitastor - fio drivers
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: fio = 3.27-8.el9
|
||||
Requires: fio = 3.27-7.el9
|
||||
|
||||
|
||||
%description -n vitastor-fio
|
||||
|
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.9.1")
|
||||
add_definitions(-DVERSION="0.8.9")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
vitastor_common
|
||||
@@ -141,8 +141,6 @@ add_library(vitastor_client SHARED
|
||||
cli_common.cpp
|
||||
cli_alloc_osd.cpp
|
||||
cli_status.cpp
|
||||
cli_describe.cpp
|
||||
cli_fix.cpp
|
||||
cli_df.cpp
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
@@ -301,7 +299,7 @@ add_executable(test_cluster_client
|
||||
EXCLUDE_FROM_ALL
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
|
@@ -73,10 +73,7 @@ Input:
|
||||
write request is copied into the metadata area bitwise and stored there.
|
||||
|
||||
Output:
|
||||
- retval = number of bytes actually read/written or negative error number
|
||||
-EINVAL = invalid input parameters
|
||||
-ENOENT = requested object/version does not exist for reads
|
||||
-ENOSPC = no space left in the store for writes
|
||||
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
|
||||
- version = the version actually read or written
|
||||
|
||||
## BS_OP_DELETE
|
||||
@@ -125,14 +122,11 @@ Output:
|
||||
Get a list of all objects in this Blockstore.
|
||||
|
||||
Input:
|
||||
- pg_alignment = PG alignment
|
||||
- pg_count = PG count or 0 to list all objects
|
||||
- pg_number = PG number
|
||||
- list_stable_limit = max number of clean objects in the reply
|
||||
it's guaranteed that dirty objects are returned from the same interval,
|
||||
i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
|
||||
- min_oid = min inode/stripe or 0 to list all objects
|
||||
- max_oid = max inode/stripe or 0 to list all objects
|
||||
- oid.stripe = PG alignment
|
||||
- len = PG count or 0 to list all objects
|
||||
- offset = PG number
|
||||
- oid.inode = min inode number or 0 to list all inodes
|
||||
- version = max inode number or 0 to list all inodes
|
||||
|
||||
Output:
|
||||
- retval = total obj_ver_id count
|
||||
@@ -149,27 +143,10 @@ struct blockstore_op_t
|
||||
uint64_t opcode;
|
||||
// finish callback
|
||||
std::function<void (blockstore_op_t*)> callback;
|
||||
union __attribute__((__packed__))
|
||||
{
|
||||
// R/W
|
||||
struct __attribute__((__packed__))
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
};
|
||||
// List
|
||||
struct __attribute__((__packed__))
|
||||
{
|
||||
object_id min_oid;
|
||||
object_id max_oid;
|
||||
uint32_t pg_alignment;
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_number;
|
||||
uint32_t list_stable_limit;
|
||||
};
|
||||
};
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
void *buf;
|
||||
void *bitmap;
|
||||
int retval;
|
||||
|
@@ -536,27 +536,14 @@ resume_1:
|
||||
return false;
|
||||
}
|
||||
// zero out old metadata entry
|
||||
{
|
||||
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
|
||||
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
|
||||
{
|
||||
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
|
||||
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
|
||||
old_entry->version, cur.oid.inode, cur.oid.stripe);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||
if (meta_old.sector != meta_new.sector)
|
||||
{
|
||||
await_sqe(15);
|
||||
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
|
||||
);
|
||||
wait_count++;
|
||||
}
|
||||
await_sqe(15);
|
||||
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
|
||||
);
|
||||
wait_count++;
|
||||
}
|
||||
if (has_delete)
|
||||
{
|
||||
|
@@ -462,11 +462,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->pg_number+1;
|
||||
uint32_t pg_count = op->pg_count;
|
||||
uint64_t pg_stripe_size = op->pg_alignment;
|
||||
uint64_t min_inode = op->min_oid.inode;
|
||||
uint64_t max_inode = op->max_oid.inode;
|
||||
uint32_t list_pg = op->offset+1;
|
||||
uint32_t pg_count = op->len;
|
||||
uint64_t pg_stripe_size = op->oid.stripe;
|
||||
uint64_t min_inode = op->oid.inode;
|
||||
uint64_t max_inode = op->version;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||
{
|
||||
@@ -513,13 +513,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
stable_alloc = op->list_stable_limit;
|
||||
if (stable_alloc > 1024*1024)
|
||||
stable_alloc = 1024*1024;
|
||||
}
|
||||
if (stable_alloc < 32768)
|
||||
else
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
@@ -530,22 +524,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
auto max_oid = op->max_oid;
|
||||
bool limited = false;
|
||||
pool_pg_id_t last_shard_id = 0;
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
{
|
||||
clean_it = clean_db.lower_bound(op->min_oid);
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
clean_end = clean_db.upper_bound(max_oid);
|
||||
clean_it = clean_db.lower_bound({
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
});
|
||||
clean_end = clean_db.upper_bound({
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
@@ -564,29 +558,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
if (!limited)
|
||||
{
|
||||
limited = true;
|
||||
max_oid = stable[stable_count-1].oid;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
// To maintain the order, we have to include objects in the same range from other shards
|
||||
if (last_shard_id != 0 && last_shard_id != shard_it->first)
|
||||
std::sort(stable, stable+stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
last_shard_id = shard_it->first;
|
||||
}
|
||||
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||
if (first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||
// If that's not a per-PG listing, sort clean entries
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
@@ -595,17 +571,20 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
obj_ver_id *unstable = NULL;
|
||||
{
|
||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
{
|
||||
dirty_it = dirty_db.lower_bound({
|
||||
.oid = op->min_oid,
|
||||
.oid = {
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
},
|
||||
.version = 0,
|
||||
});
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
dirty_end = dirty_db.upper_bound({
|
||||
.oid = max_oid,
|
||||
.oid = {
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
},
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
@@ -649,11 +628,6 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
stable[stable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
// Stop here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@@ -124,8 +124,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
|
||||
if (!clean_found && !dirty_found)
|
||||
{
|
||||
// region is not allocated - return zeroes
|
||||
memset(read_op->buf, 0, read_op->len);
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
read_op->retval = read_op->len;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
@@ -138,16 +140,14 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
dirty_entry& dirty = dirty_it->second;
|
||||
bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
|
||||
if (IS_SYNCED(dirty.state))
|
||||
{
|
||||
if (!version_ok && read_op->version != 0)
|
||||
read_op->version = dirty_it->first.version;
|
||||
version_ok = true;
|
||||
}
|
||||
if (version_ok)
|
||||
{
|
||||
if (IS_DELETE(dirty.state))
|
||||
{
|
||||
assert(!result_version);
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (!result_version)
|
||||
{
|
||||
result_version = dirty_it->first.version;
|
||||
@@ -234,19 +234,12 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!result_version)
|
||||
{
|
||||
// May happen if there are entries in dirty_db but all of them are !version_ok
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (fulfilled < read_op->len)
|
||||
else if (fulfilled < read_op->len)
|
||||
{
|
||||
// fill remaining parts with zeroes
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
assert(fulfilled == read_op->len);
|
||||
}
|
||||
assert(fulfilled == read_op->len);
|
||||
read_op->version = result_version;
|
||||
if (!PRIV(read_op)->pending_ops)
|
||||
{
|
||||
|
@@ -179,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
{
|
||||
object_id oid = dirty_it->first.oid;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
dirty_it = dirty_end;
|
||||
// Unblock operations blocked by delete flushing
|
||||
|
@@ -103,7 +103,7 @@ blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->buf = NULL;
|
||||
sync_op->callback = [](blockstore_op_t *sync_op)
|
||||
sync_op->callback = [this](blockstore_op_t *sync_op)
|
||||
{
|
||||
delete sync_op;
|
||||
};
|
||||
@@ -244,7 +244,7 @@ int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_
|
||||
// Make a wrapped callback
|
||||
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
||||
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
||||
auto cb = [op, good_items = good_vers.items,
|
||||
auto cb = [this, op, good_items = good_vers.items,
|
||||
bad_items = bad_vers.items, split_op_counter,
|
||||
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
||||
{
|
||||
|
@@ -6,7 +6,7 @@
|
||||
bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
{
|
||||
// Check or assign version number
|
||||
bool found = false, deleted = false, unsynced = false, is_del = (op->opcode == BS_OP_DELETE);
|
||||
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
|
||||
bool wait_big = false, wait_del = false;
|
||||
void *bmp = NULL;
|
||||
uint64_t version = 1;
|
||||
@@ -26,7 +26,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
found = true;
|
||||
version = dirty_it->first.version + 1;
|
||||
deleted = IS_DELETE(dirty_it->second.state);
|
||||
unsynced = !IS_SYNCED(dirty_it->second.state);
|
||||
wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
|
||||
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
|
||||
? !IS_SYNCED(dirty_it->second.state)
|
||||
@@ -82,28 +81,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
wait_del = true;
|
||||
PRIV(op)->real_version = op->version;
|
||||
op->version = version;
|
||||
if (unsynced)
|
||||
{
|
||||
// Issue an additional sync so the delete reaches the journal
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->callback = [this, op](blockstore_op_t *sync_op)
|
||||
{
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version-1,
|
||||
}, true);
|
||||
delete sync_op;
|
||||
};
|
||||
enqueue_op(sync_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = version-1,
|
||||
}, true);
|
||||
}
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = version-1,
|
||||
}, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
42
src/cli.cpp
42
src/cli.cpp
@@ -73,37 +73,6 @@ static const char* help_text =
|
||||
" <to> must be a child of <from> and <target> may be one of the layers between\n"
|
||||
" <from> and <to>, including <from> and <to>.\n"
|
||||
"\n"
|
||||
"vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
|
||||
" Describe unclean object locations in the cluster.\n"
|
||||
" --osds <osds>\n"
|
||||
" Only list objects from primary OSD(s) <osds>.\n"
|
||||
" --object-state <states>\n"
|
||||
" Only list objects in given state(s). State(s) may include:\n"
|
||||
" degraded, misplaced, incomplete, corrupted, inconsistent.\n"
|
||||
" --pool <pool name or number>\n"
|
||||
" Only list objects in the given pool.\n"
|
||||
" --inode, --min-inode, --max-inode\n"
|
||||
" Restrict listing to specific inode numbers.\n"
|
||||
" --min-offset, --max-offset\n"
|
||||
" Restrict listing to specific offsets inside inodes.\n"
|
||||
"\n"
|
||||
"vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]\n"
|
||||
" Fix inconsistent objects in the cluster by deleting some copies.\n"
|
||||
" --objects <objects>\n"
|
||||
" Objects to fix, either in plain text or JSON format. If not specified,\n"
|
||||
" object list will be read from STDIN in one of the same formats.\n"
|
||||
" Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...\n"
|
||||
" JSON format: [{\"inode\":\"0x...\",\"stripe\":\"0x...\"},...]\n"
|
||||
" --bad-osds <osds>\n"
|
||||
" Remove inconsistent copies/parts of objects from these OSDs, effectively\n"
|
||||
" marking them bad and allowing Vitastor to recover objects from other copies.\n"
|
||||
" --part <number>\n"
|
||||
" Only remove EC part <number> (from 0 to pg_size-1), required for extreme\n"
|
||||
" edge cases where one OSD has multiple parts of a EC object.\n"
|
||||
" --check no\n"
|
||||
" Do not recheck that requested objects are actually inconsistent,\n"
|
||||
" delete requested copies/parts anyway.\n"
|
||||
"\n"
|
||||
"vitastor-cli alloc-osd\n"
|
||||
" Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
|
||||
"\n"
|
||||
@@ -199,7 +168,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
{
|
||||
cli_result_t result = {};
|
||||
p->is_command_line = true;
|
||||
p->parse_config(cfg);
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
cfg.erase("command");
|
||||
@@ -308,16 +276,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
}
|
||||
action_cb = p->start_rm(cfg);
|
||||
}
|
||||
else if (cmd[0] == "describe")
|
||||
{
|
||||
// Describe unclean objects
|
||||
action_cb = p->start_describe(cfg);
|
||||
}
|
||||
else if (cmd[0] == "fix")
|
||||
{
|
||||
// Fix inconsistent objects (by deleting some copies)
|
||||
action_cb = p->start_fix(cfg);
|
||||
}
|
||||
else if (cmd[0] == "alloc-osd")
|
||||
{
|
||||
// Allocate a new OSD number
|
||||
|
@@ -34,12 +34,12 @@ public:
|
||||
bool list_first = false;
|
||||
bool json_output = false;
|
||||
int log_level = 0;
|
||||
bool is_command_line = false;
|
||||
bool color = false;
|
||||
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
bool no_recovery = false, no_rebalance = false, readonly = false;
|
||||
|
||||
int waiting = 0;
|
||||
cli_result_t etcd_err;
|
||||
@@ -56,8 +56,6 @@ public:
|
||||
friend struct snap_remover_t;
|
||||
|
||||
std::function<bool(cli_result_t &)> start_status(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_describe(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_fix(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_df(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_create(json11::Json);
|
||||
|
@@ -1,256 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli_fix.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
std::vector<uint64_t> parse_uint64_list(json11::Json val)
|
||||
{
|
||||
std::vector<uint64_t> ret;
|
||||
if (val.is_number())
|
||||
ret.push_back(val.uint64_value());
|
||||
else if (val.is_string())
|
||||
{
|
||||
const std::string & s = val.string_value();
|
||||
for (int i = 0, p = -1; i <= s.size(); i++)
|
||||
{
|
||||
if (p < 0 && i < s.size() && (isdigit(s[i]) || s[i] == 'x'))
|
||||
p = i;
|
||||
else if (p >= 0 && (i >= s.size() || !isdigit(s[i]) && s[i] != 'x'))
|
||||
{
|
||||
ret.push_back(stoull_full(s.substr(p, i-p), 0));
|
||||
p = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (val.is_array())
|
||||
{
|
||||
for (auto & pg_num: val.array_items())
|
||||
ret.push_back(pg_num.uint64_value());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct cli_describe_t
|
||||
{
|
||||
uint64_t object_state = 0;
|
||||
pool_id_t only_pool = 0;
|
||||
std::vector<uint64_t> only_osds;
|
||||
uint64_t min_inode = 0, max_inode = 0;
|
||||
uint64_t min_offset = 0, max_offset = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
int state = 0;
|
||||
int count = 0;
|
||||
|
||||
json11::Json options;
|
||||
cli_result_t result;
|
||||
json11::Json::array describe_items;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void parse_options(json11::Json cfg)
|
||||
{
|
||||
only_pool = cfg["pool"].uint64_value();
|
||||
if (!only_pool && cfg["pool"].is_string())
|
||||
{
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if (pp.second.name == cfg["pool"].string_value())
|
||||
{
|
||||
only_pool = pp.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
min_inode = cfg["inode"].uint64_value();
|
||||
if (min_inode)
|
||||
{
|
||||
if (!INODE_POOL(min_inode))
|
||||
min_inode |= (uint64_t)only_pool << (64-POOL_ID_BITS);
|
||||
max_inode = min_inode;
|
||||
min_offset = max_offset = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
min_inode = stoull_full(cfg["min_inode"].string_value(), 0); // to support 0x...
|
||||
max_inode = stoull_full(cfg["max_inode"].string_value(), 0);
|
||||
min_offset = stoull_full(cfg["min_offset"].string_value(), 0);
|
||||
max_offset = stoull_full(cfg["max_offset"].string_value(), 0);
|
||||
if (!min_inode && !max_inode && only_pool)
|
||||
{
|
||||
min_inode = (uint64_t)only_pool << (64-POOL_ID_BITS);
|
||||
max_inode = ((uint64_t)only_pool << (64-POOL_ID_BITS)) |
|
||||
(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
}
|
||||
}
|
||||
only_osds = parse_uint64_list(cfg["osds"]);
|
||||
object_state = stoull_full(cfg["object_state"].string_value(), 0);
|
||||
if (!object_state && cfg["object_state"].is_string())
|
||||
{
|
||||
if (cfg["object_state"].string_value().find("inconsistent") != std::string::npos)
|
||||
object_state |= OBJ_INCONSISTENT;
|
||||
if (cfg["object_state"].string_value().find("corrupted") != std::string::npos)
|
||||
object_state |= OBJ_CORRUPTED;
|
||||
if (cfg["object_state"].string_value().find("incomplete") != std::string::npos)
|
||||
object_state |= OBJ_INCOMPLETE;
|
||||
if (cfg["object_state"].string_value().find("degraded") != std::string::npos)
|
||||
object_state |= OBJ_DEGRADED;
|
||||
if (cfg["object_state"].string_value().find("misplaced") != std::string::npos)
|
||||
object_state |= OBJ_MISPLACED;
|
||||
}
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (state == 100)
|
||||
return;
|
||||
parse_options(options);
|
||||
if (min_inode && !INODE_POOL(min_inode))
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!only_osds.size())
|
||||
{
|
||||
uint64_t min_pool = min_inode >> (64-POOL_ID_BITS);
|
||||
uint64_t max_pool = max_inode >> (64-POOL_ID_BITS);
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
if (pp.first >= min_pool && (!max_pool || pp.first <= max_pool))
|
||||
{
|
||||
for (auto & pgp: pp.second.pg_config)
|
||||
only_osds.push_back(pgp.second.cur_primary);
|
||||
}
|
||||
}
|
||||
}
|
||||
remove_duplicates(only_osds);
|
||||
parent->cli->init_msgr();
|
||||
if (parent->json_output && parent->is_command_line)
|
||||
{
|
||||
printf("[\n");
|
||||
}
|
||||
for (int i = 0; i < only_osds.size(); i++)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t;
|
||||
op->req = (osd_any_op_t){
|
||||
.describe = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DESCRIBE,
|
||||
},
|
||||
.object_state = object_state,
|
||||
.min_inode = min_inode,
|
||||
.min_offset = min_offset,
|
||||
.max_inode = max_inode,
|
||||
.max_offset = max_offset,
|
||||
},
|
||||
};
|
||||
op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
|
||||
osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
|
||||
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
|
||||
for (int i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
if (!parent->json_output || parent->is_command_line)
|
||||
{
|
||||
#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
|
||||
printf(
|
||||
(parent->json_output
|
||||
? (count > 0 ? ",\n " FMT : " " FMT)
|
||||
: "%lx:%lx part %u on OSD %lu%s%s%s\n"),
|
||||
#undef FMT
|
||||
items[i].inode, items[i].stripe,
|
||||
items[i].role, items[i].osd_num,
|
||||
(items[i].loc_bad & LOC_CORRUPTED ? (parent->json_output ? ",\"corrupted\":true" : " corrupted") : ""),
|
||||
(items[i].loc_bad & LOC_INCONSISTENT ? (parent->json_output ? ",\"inconsistent\":true" : " inconsistent") : ""),
|
||||
(items[i].loc_bad & LOC_OUTDATED ? (parent->json_output ? ",\"outdated\":true" : " outdated") : "")
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto json_item = json11::Json::object {
|
||||
{ "inode", (uint64_t)items[i].inode },
|
||||
{ "stripe", (uint64_t)items[i].stripe },
|
||||
{ "part", (uint64_t)items[i].role },
|
||||
{ "osd_num", (uint64_t)items[i].osd_num },
|
||||
};
|
||||
if (items[i].loc_bad & LOC_CORRUPTED)
|
||||
json_item["corrupted"] = true;
|
||||
if (items[i].loc_bad & LOC_INCONSISTENT)
|
||||
json_item["inconsistent"] = true;
|
||||
if (items[i].loc_bad & LOC_OUTDATED)
|
||||
json_item["outdated"] = true;
|
||||
describe_items.push_back(json_item);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
parent->waiting--;
|
||||
if (!parent->waiting)
|
||||
loop();
|
||||
};
|
||||
parent->waiting++;
|
||||
parent->cli->execute_raw(only_osds[i], op);
|
||||
}
|
||||
resume_1:
|
||||
state = 1;
|
||||
if (parent->waiting > 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (parent->json_output && parent->is_command_line)
|
||||
{
|
||||
printf(count > 0 ? "\n]\n" : "]\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
result.data = describe_items;
|
||||
}
|
||||
state = 100;
|
||||
describe_items.clear();
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_describe(json11::Json cfg)
|
||||
{
|
||||
auto describer = new cli_describe_t();
|
||||
describer->parent = this;
|
||||
describer->options = cfg;
|
||||
return [describer](cli_result_t & result)
|
||||
{
|
||||
describer->loop();
|
||||
if (describer->is_done())
|
||||
{
|
||||
result = describer->result;
|
||||
delete describer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
313
src/cli_fix.cpp
313
src/cli_fix.cpp
@@ -1,313 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli_fix.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "str_util.h"
|
||||
|
||||
struct cli_fix_t
|
||||
{
|
||||
std::vector<object_id> objects;
|
||||
int part = -1;
|
||||
int processed_count = 0;
|
||||
std::set<osd_num_t> bad_osds;
|
||||
bool no_check = false;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
int state = 0;
|
||||
|
||||
json11::Json options;
|
||||
cli_result_t result;
|
||||
json11::Json::array fix_result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void parse_objects_str(std::string str)
|
||||
{
|
||||
str = trim(str);
|
||||
if (str[0] == '[')
|
||||
{
|
||||
std::string json_err;
|
||||
json11::Json list = json11::Json::parse(str, json_err);
|
||||
if (json_err != "")
|
||||
fprintf(stderr, "Invalid JSON object list input: %s\n", json_err.c_str());
|
||||
else
|
||||
parse_object_list(list);
|
||||
}
|
||||
else
|
||||
{
|
||||
const char *s = str.c_str();
|
||||
char *e = NULL;
|
||||
int len = str.size();
|
||||
object_id oid;
|
||||
for (int p = 0; p < len; p++)
|
||||
{
|
||||
if (isdigit(s[p]))
|
||||
{
|
||||
int p0 = p;
|
||||
oid.inode = strtoull(s+p, &e, 0);
|
||||
p = e-s;
|
||||
while (p < len && !isdigit(s[p]) && s[p] != ':')
|
||||
p++;
|
||||
if (s[p] != ':')
|
||||
{
|
||||
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
|
||||
continue;
|
||||
}
|
||||
p++;
|
||||
while (p < len && !isdigit(s[p]))
|
||||
p++;
|
||||
oid.stripe = strtoull(s+p, &e, 0) & ~STRIPE_MASK;
|
||||
p = e-s;
|
||||
if (oid.inode)
|
||||
objects.push_back(oid);
|
||||
else
|
||||
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void parse_object_list(json11::Json list)
|
||||
{
|
||||
for (auto & obj: list.array_items())
|
||||
{
|
||||
object_id oid = (object_id){
|
||||
.inode = stoull_full(obj["inode"].string_value(), 0),
|
||||
.stripe = stoull_full(obj["stripe"].string_value(), 0) & ~STRIPE_MASK,
|
||||
};
|
||||
if (oid.inode)
|
||||
objects.push_back(oid);
|
||||
else
|
||||
fprintf(stderr, "Invalid JSON object ID in input: %s, bad or missing \"inode\" field\n", obj.dump().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void parse_options(json11::Json cfg)
|
||||
{
|
||||
json11::Json object_list;
|
||||
if (cfg["objects"].is_null())
|
||||
parse_objects_str(read_all_fd(0));
|
||||
else if (cfg["objects"].is_string())
|
||||
parse_objects_str(cfg["objects"].string_value());
|
||||
else
|
||||
parse_object_list(cfg["objects"].array_items());
|
||||
for (auto osd_num: parse_uint64_list(cfg["bad_osds"]))
|
||||
bad_osds.insert(osd_num);
|
||||
no_check = json_is_false(cfg["check"]);
|
||||
if (cfg["part"].is_number() || cfg["part"].is_string())
|
||||
part = cfg["part"].uint64_value();
|
||||
}
|
||||
|
||||
// Resumable driver of the "fix" operation.
// First call: parses options, validates them, deduplicates the object list and
// initializes the messenger. Then, for every listed object, it sends DESCRIBE to
// the primary OSD of the object's PG, SEC_DELETE for each selected copy and finally
// SCRUB to the primary. The function returns whenever it has to wait for replies;
// the reply callbacks decrement parent->waiting and call loop() again.
// state == 1 means "resume object processing", state == 100 means "finished".
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (state == 100)
|
||||
return;
|
||||
parse_options(options);
|
||||
// Both an object list and a set of OSDs are mandatory, otherwise finish with EINVAL
|
||||
if (!objects.size())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Object list is not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!bad_osds.size())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "OSDs are not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
remove_duplicates(objects);
|
||||
parent->cli->init_msgr();
|
||||
resume_1:
|
||||
state = 1;
|
||||
while (processed_count < objects.size())
|
||||
{
|
||||
// Throttle: stop submitting while too many objects are in flight.
|
||||
// A reply callback re-enters loop() after decrementing parent->waiting
|
||||
if (parent->waiting >= parent->iodepth*parent->parallel_osds)
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto & obj = objects[processed_count++];
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
|
||||
continue;
|
||||
}
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
pg_num_t pg_num = (obj.stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||
auto pg_it = pool_cfg.pg_config.find(pg_num);
|
||||
// Objects of PGs without a current primary or without the PG_ACTIVE state bit are skipped
|
||||
if (pg_it == pool_cfg.pg_config.end() ||
|
||||
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
|
||||
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
|
||||
);
|
||||
continue;
|
||||
}
|
||||
osd_num_t primary_osd = pg_it->second.cur_primary;
|
||||
// Describe -> Remove some copies -> Scrub again
|
||||
osd_op_t *op = new osd_op_t;
|
||||
// min and max bounds are equal, i.e. exactly this one object is requested
|
||||
op->req = (osd_any_op_t){
|
||||
.describe = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DESCRIBE,
|
||||
},
|
||||
.min_inode = obj.inode,
|
||||
.min_offset = obj.stripe,
|
||||
.max_inode = obj.inode,
|
||||
.max_offset = obj.stripe,
|
||||
},
|
||||
};
|
||||
op->callback = [this, primary_osd, &obj](osd_op_t *op)
|
||||
{
|
||||
// The reply is rejected when retval is negative or when the payload size
|
||||
// does not equal retval items of osd_reply_describe_item_t
|
||||
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
|
||||
parent->waiting--;
|
||||
loop();
|
||||
}
|
||||
else
|
||||
{
|
||||
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
|
||||
// Heap-allocated counter of pending removals shared by all removal callbacks.
|
||||
// It starts at 1 (a guard held by this callback) so that it can't reach zero
|
||||
// while removals are still being submitted below
|
||||
int *rm_count = (int*)malloc_or_die(sizeof(int));
|
||||
*rm_count = 1; // just in case if anything gets called instantly
|
||||
for (int i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
// A copy is removed when it is flagged LOC_INCONSISTENT (or no_check is set),
|
||||
// is located on one of bad_osds, and matches `part` (or part == -1)
|
||||
if (((items[i].loc_bad & LOC_INCONSISTENT) || no_check) &&
|
||||
bad_osds.find(items[i].osd_num) != bad_osds.end() &&
|
||||
(part == -1 || items[i].role == part))
|
||||
{
|
||||
// Remove
|
||||
uint64_t rm_osd_num = items[i].osd_num;
|
||||
osd_op_t *rm_op = new osd_op_t;
|
||||
// The role of the copy is OR-ed into the stripe of the deleted object ID
|
||||
rm_op->req = (osd_any_op_t){
|
||||
.sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_SEC_DELETE,
|
||||
},
|
||||
.oid = {
|
||||
.inode = op->req.describe.min_inode,
|
||||
.stripe = op->req.describe.min_offset | items[i].role,
|
||||
},
|
||||
.version = 0,
|
||||
},
|
||||
};
|
||||
rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
|
||||
{
|
||||
(*rm_count)--;
|
||||
// Report the outcome: error to stderr, otherwise either a JSON item
|
||||
// appended to fix_result or a plain text line on stdout
|
||||
if (rm_op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
|
||||
rm_osd_num, rm_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (parent->json_output)
|
||||
{
|
||||
fix_result.push_back(json11::Json::object {
|
||||
{ "inode", (uint64_t)rm_op->req.sec_del.oid.inode },
|
||||
{ "stripe", (uint64_t)rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK },
|
||||
{ "part", (uint64_t)rm_op->req.sec_del.oid.stripe & STRIPE_MASK },
|
||||
{ "osd_num", (uint64_t)rm_osd_num },
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Removed %lx:%lx (part %lu) from OSD %lu\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
|
||||
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
|
||||
);
|
||||
}
|
||||
delete rm_op;
|
||||
// The last finished removal releases the counter and asks the primary OSD
|
||||
// to scrub the object. This happens regardless of whether removals succeeded
|
||||
if (!(*rm_count))
|
||||
{
|
||||
// Scrub
|
||||
free(rm_count);
|
||||
osd_op_t *scrub_op = new osd_op_t;
|
||||
scrub_op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_SCRUB,
|
||||
},
|
||||
.inode = obj.inode,
|
||||
.offset = obj.stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
scrub_op->callback = [this, primary_osd, &obj](osd_op_t *scrub_op)
|
||||
{
|
||||
// -ENOENT is not reported as a scrub failure
|
||||
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
|
||||
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
delete scrub_op;
|
||||
// This object is finished: free its slot and continue with the next ones
|
||||
parent->waiting--;
|
||||
loop();
|
||||
};
|
||||
parent->cli->execute_raw(primary_osd, scrub_op);
|
||||
}
|
||||
};
|
||||
// Count the removal before it is submitted
|
||||
(*rm_count)++;
|
||||
parent->cli->execute_raw(rm_osd_num, rm_op);
|
||||
}
|
||||
}
|
||||
// Drop the guard. Zero here means no removal is pending, so the object is
|
||||
// finished right away and no scrub is sent from this path
|
||||
(*rm_count)--;
|
||||
if (!*rm_count)
|
||||
{
|
||||
free(rm_count);
|
||||
parent->waiting--;
|
||||
loop();
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
};
|
||||
// One unit of parent->waiting is held per object until its describe/remove/scrub chain ends
|
||||
parent->waiting++;
|
||||
parent->cli->execute_raw(primary_osd, op);
|
||||
}
|
||||
// All objects are submitted; finish only when nothing is in flight anymore
|
||||
if (parent->waiting > 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (parent->json_output)
|
||||
{
|
||||
result.data = fix_result;
|
||||
}
|
||||
state = 100;
|
||||
|
|
||||
}
|
||||
};
|
||||
|
||||
// Create a cli_fix_t task for the given options and return its step function.
// Every call of the returned function runs one loop() pass of the task.
// While the task is not done the function returns false; once is_done()
// reports completion, the task result is copied into `result`, the task
// is destroyed and true is returned.
std::function<bool(cli_result_t &)> cli_tool_t::start_fix(json11::Json cfg)
{
    cli_fix_t *task = new cli_fix_t();
    task->parent = this;
    task->options = cfg;
    return [task](cli_result_t & result) -> bool
    {
        task->loop();
        if (!task->is_done())
        {
            return false;
        }
        result = task->result;
        delete task;
        return true;
    };
}
|
@@ -1,26 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cli.h"
|
||||
#include <algorithm>
|
||||
|
||||
std::vector<uint64_t> parse_uint64_list(json11::Json val);
|
||||
|
||||
// Sort `ret` in ascending order and remove repeated elements in place.
//
// T must provide operator< (used by std::sort) and operator!= (used to
// detect equal neighbours), exactly as before. An empty vector is left
// untouched. Elements are erased from the tail instead of calling resize(),
// so T does not have to be default-constructible, and no signed index is
// compared against the unsigned vector size anymore.
template<class T> void remove_duplicates(std::vector<T> & ret)
{
    if (ret.empty())
        return;
    std::sort(ret.begin(), ret.end());
    // After sorting equal elements are adjacent, so std::unique() compacts
    // the distinct ones to the front and returns the new logical end
    auto new_end = std::unique(
        ret.begin(), ret.end(),
        [](const T & a, const T & b) { return !(a != b); }
    );
    ret.erase(new_end, ret.end());
}
|
||||
|
||||
// from http_client.cpp...
|
||||
bool json_is_false(const json11::Json & val);
|
@@ -410,17 +410,14 @@ struct rm_osd_t
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||
);
|
||||
auto hist = json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
};
|
||||
if (pg_cfg.next_scrub)
|
||||
hist["next_scrub"] = pg_cfg.next_scrub;
|
||||
history_updates.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", history_key },
|
||||
{ "value", base64_encode(json11::Json(hist).dump()) },
|
||||
{ "value", base64_encode(json11::Json(json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
}).dump()) },
|
||||
} },
|
||||
});
|
||||
history_checks.push_back(json11::Json::object {
|
||||
|
@@ -201,7 +201,6 @@ resume_2:
|
||||
bool readonly = json_is_true(parent->cli->config["readonly"]);
|
||||
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
|
||||
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
|
||||
bool no_scrub = json_is_true(parent->cli->config["no_scrub"]);
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
@@ -220,7 +219,6 @@ resume_2:
|
||||
{ "readonly", readonly },
|
||||
{ "no_recovery", no_recovery },
|
||||
{ "no_rebalance", no_rebalance },
|
||||
{ "no_scrub", no_scrub },
|
||||
{ "pool_count", pool_count },
|
||||
{ "active_pool_count", pools_active },
|
||||
{ "pg_states", pgs_by_state },
|
||||
|
@@ -35,7 +35,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
// peer_osd just connected
|
||||
continue_ops();
|
||||
continue_lists();
|
||||
continue_raw_ops(peer_osd);
|
||||
}
|
||||
else if (dirty_buffers.size())
|
||||
{
|
||||
@@ -105,19 +104,6 @@ cluster_op_t::~cluster_op_t()
|
||||
}
|
||||
}
|
||||
|
||||
// Send all raw operations queued in `raw_ops` for `peer_osd`.
// Each queued operation is marked as outgoing (OSD_OP_OUT), bound to the
// peer's file descriptor taken from msgr.osd_peer_fds, pushed to the messenger
// outbox and removed from the queue.
void cluster_client_t::continue_raw_ops(osd_num_t peer_osd)
{
    // lower_bound() always yields the FIRST entry with this key, while
    // multimap::find() is allowed to return any of the equal-key entries,
    // which could leave some queued operations unsent
    auto it = raw_ops.lower_bound(peer_osd);
    while (it != raw_ops.end() && it->first == peer_osd)
    {
        auto op = it->second;
        op->op_type = OSD_OP_OUT;
        // at() throws if the peer has no registered descriptor
        op->peer_fd = msgr.osd_peer_fds.at(peer_osd);
        msgr.outbox_push(op);
        // Post-increment keeps a valid iterator to the next entry across erase()
        raw_ops.erase(it++);
    }
}
|
||||
|
||||
void cluster_client_t::init_msgr()
|
||||
{
|
||||
if (msgr_initialized)
|
||||
@@ -526,23 +512,6 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
// Send a raw operation to the given OSD.
// When the OSD already has a descriptor in msgr.osd_peer_fds, the operation is
// pushed to the outbox right away. Otherwise a connection is requested (unless
// the OSD is already listed in msgr.wanted_peers) and the operation is stored
// in `raw_ops` under the OSD number.
void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
{
    auto peer_it = msgr.osd_peer_fds.find(osd_num);
    if (peer_it == msgr.osd_peer_fds.end())
    {
        const bool already_wanted = msgr.wanted_peers.find(osd_num) != msgr.wanted_peers.end();
        if (!already_wanted)
        {
            msgr.connect_peer(osd_num, st_cli.peer_states[osd_num]);
        }
        raw_ops.emplace(osd_num, op);
        return;
    }
    op->op_type = OSD_OP_OUT;
    op->peer_fd = peer_it->second;
    msgr.outbox_push(op);
}
|
||||
|
||||
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
|
||||
{
|
||||
// Save operation for replay when one of PGs goes out of sync
|
||||
|
@@ -103,7 +103,6 @@ class cluster_client_t
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
std::vector<inode_list_t*> lists;
|
||||
std::multimap<osd_num_t, osd_op_t*> raw_ops;
|
||||
int continuing_ops = 0;
|
||||
bool msgr_initialized = false;
|
||||
|
||||
@@ -119,7 +118,6 @@ public:
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
~cluster_client_t();
|
||||
void execute(cluster_op_t *op);
|
||||
void execute_raw(osd_num_t osd_num, osd_op_t *op);
|
||||
bool is_ready();
|
||||
void on_ready(std::function<void(void)> fn);
|
||||
|
||||
@@ -155,5 +153,4 @@ protected:
|
||||
void continue_lists();
|
||||
void continue_listing(inode_list_t *lst);
|
||||
void send_list(inode_list_osd_t *cur_list);
|
||||
void continue_raw_ops(osd_num_t peer_osd);
|
||||
};
|
||||
|
@@ -55,6 +55,23 @@ std::string realpath_str(std::string path, bool nofail)
|
||||
return rp;
|
||||
}
|
||||
|
||||
// Read everything from a file descriptor until end of file and return it as a string.
//
// The buffer grows in 1024-byte steps. EAGAIN and EINTR are retried; any other
// read() error stops reading, and the data received so far is returned.
// Sizes are kept in size_t/ssize_t so that the byte counter can't overflow `int`
// on very large inputs and the read() result is not truncated.
std::string read_all_fd(int fd)
{
    size_t res_size = 0;
    std::string res;
    while (1)
    {
        // Always keep 1024 free bytes after the data read so far
        res.resize(res_size+1024);
        ssize_t r = read(fd, &res[res_size], res.size()-res_size);
        if (r > 0)
            res_size += (size_t)r;
        else if (!r || (errno != EAGAIN && errno != EINTR))
            break;
    }
    // Cut off the unused tail of the buffer
    res.resize(res_size);
    return res;
}
|
||||
|
||||
std::string read_file(std::string file, bool allow_enoent)
|
||||
{
|
||||
std::string res;
|
||||
|
@@ -7,8 +7,8 @@
|
||||
#ifndef __MOCK__
|
||||
#include "addr_util.h"
|
||||
#include "http_client.h"
|
||||
#endif
|
||||
#include "str_util.h"
|
||||
#endif
|
||||
|
||||
etcd_state_client_t::~etcd_state_client_t()
|
||||
{
|
||||
@@ -777,10 +777,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
|
||||
continue;
|
||||
}
|
||||
// Scrub Interval
|
||||
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
|
||||
if (!pc.scrub_interval)
|
||||
pc.scrub_interval = 0;
|
||||
// Immediate Commit Mode
|
||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||
? (pool_item.second["immediate_commit"].string_value() == "all"
|
||||
@@ -923,8 +919,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
// Read epoch
|
||||
pg_cfg.epoch = value["epoch"].uint64_value();
|
||||
// Next scrub timestamp (0 or empty = scrub is not needed)
|
||||
pg_cfg.next_scrub = value["next_scrub"].uint64_value();
|
||||
if (on_change_pg_history_hook != NULL)
|
||||
{
|
||||
on_change_pg_history_hook(pool_id, pg_num);
|
||||
|
@@ -39,7 +39,6 @@ struct pg_config_t
|
||||
osd_num_t cur_primary;
|
||||
int cur_state;
|
||||
uint64_t epoch;
|
||||
uint64_t next_scrub;
|
||||
};
|
||||
|
||||
struct pool_config_t
|
||||
@@ -56,7 +55,6 @@ struct pool_config_t
|
||||
uint64_t max_osd_combinations;
|
||||
uint64_t pg_stripe_size;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
uint64_t scrub_interval;
|
||||
};
|
||||
|
||||
struct inode_config_t
|
||||
|
@@ -251,10 +251,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
return;
|
||||
}
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
||||
}
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
@@ -317,10 +313,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
// Stop client
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
||||
}
|
||||
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
||||
stop_client(peer_fd, true);
|
||||
}
|
||||
else if (epoll_events & EPOLLIN)
|
||||
|
@@ -50,7 +50,7 @@ struct osd_client_t
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
int peer_port;
|
||||
int peer_fd = -1;
|
||||
int peer_fd;
|
||||
int peer_state;
|
||||
int connect_timeout_id = -1;
|
||||
int ping_time_remaining = 0;
|
||||
@@ -87,7 +87,11 @@ struct osd_client_t
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t();
|
||||
~osd_client_t()
|
||||
{
|
||||
free(in_buf);
|
||||
in_buf = NULL;
|
||||
}
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
|
@@ -251,6 +251,10 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
}
|
||||
cl->read_remaining = cur_op->req.sec_read_bmp.len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
||||
{
|
||||
if (cur_op->req.rw.len > 0)
|
||||
@@ -270,12 +274,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
}
|
||||
cl->read_remaining = cur_op->req.show_conf.json_len;
|
||||
}
|
||||
/*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}*/
|
||||
if (cl->read_remaining > 0)
|
||||
{
|
||||
// Read data
|
||||
@@ -369,16 +367,6 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
op->buf = malloc_or_die(op->reply.hdr.retval);
|
||||
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
||||
}
|
||||
else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0)
|
||||
{
|
||||
delete cl->read_op;
|
||||
cl->read_op = op;
|
||||
cl->read_state = CL_READ_REPLY_DATA;
|
||||
cl->read_remaining = op->reply.describe.result_bytes;
|
||||
free(op->buf);
|
||||
op->buf = malloc_or_die(op->reply.describe.result_bytes);
|
||||
cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
reuse:
|
||||
|
@@ -73,8 +73,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
? (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
|
||||
: (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
||||
|
@@ -122,6 +122,17 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// And close the FD only when everything is done
|
||||
// ...because peer_fd number can get reused after close()
|
||||
close(peer_fd);
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->rdma_conn)
|
||||
{
|
||||
delete cl->rdma_conn;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
// Find the item again because it can be invalidated at this point
|
||||
it = clients.find(peer_fd);
|
||||
if (it != clients.end())
|
||||
@@ -134,25 +145,3 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
delete cl;
|
||||
}
|
||||
}
|
||||
|
||||
// Release everything owned by the client: the input buffer, the peer socket
// and, in non-mock builds with RDMA support, the RDMA connection.
osd_client_t::~osd_client_t()
{
    free(in_buf);
    in_buf = NULL;
    // The descriptor is closed here, on destruction of the client object,
    // i.e. only after all references to the client are cleared
    if (!(peer_fd < 0))
    {
        close(peer_fd);
        peer_fd = -1;
    }
#if !defined(__MOCK__) && defined(WITH_RDMA)
    if (rdma_conn != NULL)
    {
        delete rdma_conn;
        rdma_conn = NULL;
    }
#endif
}
|
||||
|
@@ -137,19 +137,12 @@ public:
|
||||
"OPTIONS:\n"
|
||||
" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
|
||||
" --nbd_timeout 30\n"
|
||||
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
|
||||
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
|
||||
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
|
||||
" timeout in seconds after which the kernel will stop the device\n"
|
||||
" you can set it to 0, but beware that you won't be able to stop the device at all\n"
|
||||
" if vitastor-nbd process dies\n"
|
||||
" --nbd_max_devices 64 --nbd_max_part 3\n"
|
||||
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
|
||||
" --logfile /path/to/log/file.txt\n"
|
||||
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" or printing them to the standard output (in foreground mode).\n"
|
||||
" --dev_num N\n"
|
||||
" Use the specified device /dev/nbdN instead of automatic selection.\n"
|
||||
" --foreground 1\n"
|
||||
" Stay in foreground, do not daemonize.n",
|
||||
" options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n",
|
||||
exe_name, exe_name, exe_name
|
||||
);
|
||||
exit(0);
|
||||
|
56
src/osd.cpp
56
src/osd.cpp
@@ -13,7 +13,6 @@
|
||||
#include "osd_primary.h"
|
||||
#include "osd.h"
|
||||
#include "http_client.h"
|
||||
#include "str_util.h"
|
||||
|
||||
static blockstore_config_t json_to_bs(const json11::Json::object & config)
|
||||
{
|
||||
@@ -169,8 +168,6 @@ void osd_t::parse_config(bool init)
|
||||
no_rebalance = json_is_true(config["no_rebalance"]);
|
||||
auto old_no_recovery = no_recovery;
|
||||
no_recovery = json_is_true(config["no_recovery"]);
|
||||
auto old_no_scrub = no_scrub;
|
||||
no_scrub = json_is_true(config["no_scrub"]);
|
||||
auto old_autosync_interval = autosync_interval;
|
||||
if (!config["autosync_interval"].is_null())
|
||||
{
|
||||
@@ -210,38 +207,6 @@ void osd_t::parse_config(bool init)
|
||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||
if (!inode_vanish_time)
|
||||
inode_vanish_time = 60;
|
||||
auto old_auto_scrub = auto_scrub;
|
||||
auto_scrub = json_is_true(config["auto_scrub"]);
|
||||
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
|
||||
if (!global_scrub_interval)
|
||||
global_scrub_interval = 30*86400;
|
||||
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
|
||||
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
scrub_queue_depth = 1;
|
||||
scrub_find_best = !json_is_false(config["scrub_find_best"]);
|
||||
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
|
||||
if (scrub_ec_max_bruteforce < 1)
|
||||
scrub_ec_max_bruteforce = 100;
|
||||
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||
if (!scrub_list_limit)
|
||||
scrub_list_limit = 1000;
|
||||
if (!old_auto_scrub && auto_scrub)
|
||||
{
|
||||
// Schedule scrubbing
|
||||
for (auto & pgp: pgs)
|
||||
{
|
||||
plan_scrub(pgp.second);
|
||||
}
|
||||
}
|
||||
if (old_no_scrub && !no_scrub)
|
||||
{
|
||||
// Wakeup scrubbing
|
||||
for (auto & pgp: pgs)
|
||||
{
|
||||
schedule_scrub(pgp.second);
|
||||
}
|
||||
}
|
||||
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
|
||||
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||
{
|
||||
@@ -372,8 +337,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||
{
|
||||
// Readonly mode
|
||||
@@ -404,14 +367,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
{
|
||||
continue_primary_del(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
continue_primary_describe(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
exec_secondary(cur_op);
|
||||
@@ -476,10 +431,6 @@ void osd_t::print_stats()
|
||||
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
|
||||
}
|
||||
}
|
||||
if (corrupted_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
|
||||
}
|
||||
if (incomplete_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
||||
@@ -547,11 +498,10 @@ void osd_t::print_slow()
|
||||
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||
{
|
||||
bufprintf(
|
||||
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
||||
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
||||
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
|
||||
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
||||
op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
|
||||
op->req.sec_list.pg_stripe_size
|
||||
);
|
||||
}
|
||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
|
41
src/osd.h
41
src/osd.h
@@ -28,7 +28,6 @@
|
||||
#define OSD_PEERING_PGS 0x04
|
||||
#define OSD_FLUSHING_PGS 0x08
|
||||
#define OSD_RECOVERING 0x10
|
||||
#define OSD_SCRUBBING 0x20
|
||||
|
||||
#define MAX_AUTOSYNC_INTERVAL 3600
|
||||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||
@@ -99,7 +98,6 @@ class osd_t
|
||||
bool run_primary = false;
|
||||
bool no_rebalance = false;
|
||||
bool no_recovery = false;
|
||||
bool no_scrub = false;
|
||||
std::string bind_address;
|
||||
int bind_port, listen_backlog = 128;
|
||||
// FIXME: Implement client queue depth limit
|
||||
@@ -115,13 +113,6 @@ class osd_t
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int inode_vanish_time = 60;
|
||||
int log_level = 0;
|
||||
bool auto_scrub = false;
|
||||
uint64_t global_scrub_interval = 30*86400;
|
||||
uint64_t scrub_queue_depth = 1;
|
||||
uint64_t scrub_sleep_ms = 0;
|
||||
uint32_t scrub_list_limit = 1000;
|
||||
bool scrub_find_best = true;
|
||||
uint64_t scrub_ec_max_bruteforce = 100;
|
||||
|
||||
// cluster state
|
||||
|
||||
@@ -144,24 +135,15 @@ class osd_t
|
||||
std::set<pool_pg_num_t> dirty_pgs;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
int copies_to_delete_after_sync_count = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
std::map<object_id, osd_op_t*> scrub_ops;
|
||||
bool recovery_last_degraded = true;
|
||||
pool_pg_num_t recovery_last_pg;
|
||||
object_id recovery_last_oid;
|
||||
int recovery_pg_done = 0, recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Scrubbing
|
||||
uint64_t scrub_nearest_ts = 0;
|
||||
int scrub_timer_id = -1;
|
||||
pool_pg_num_t scrub_last_pg = {};
|
||||
osd_op_t *scrub_list_op = NULL;
|
||||
pg_list_result_t scrub_cur_list = {};
|
||||
uint64_t scrub_list_pos = 0;
|
||||
|
||||
// Unstable writes
|
||||
uint64_t unstable_write_count = 0;
|
||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||
@@ -239,14 +221,6 @@ class osd_t
|
||||
bool continue_recovery();
|
||||
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
|
||||
|
||||
// scrub
|
||||
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
|
||||
int pick_next_scrub(object_id & next_oid);
|
||||
void submit_scrub_op(object_id oid);
|
||||
bool continue_scrub();
|
||||
void plan_scrub(pg_t & pg, bool report_state = true);
|
||||
void schedule_scrub(pg_t & pg);
|
||||
|
||||
// op execution
|
||||
void exec_op(osd_op_t *cur_op);
|
||||
void finish_op(osd_op_t *cur_op, int retval);
|
||||
@@ -261,19 +235,13 @@ class osd_t
|
||||
void autosync();
|
||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||
void continue_primary_read(osd_op_t *cur_op);
|
||||
void continue_primary_scrub(osd_op_t *cur_op);
|
||||
void continue_primary_describe(osd_op_t *cur_op);
|
||||
void continue_primary_write(osd_op_t *cur_op);
|
||||
void cancel_primary_write(osd_op_t *cur_op);
|
||||
void continue_primary_sync(osd_op_t *cur_op);
|
||||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
|
||||
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
@@ -288,11 +256,10 @@ class osd_t
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
||||
|
||||
void continue_chained_read(osd_op_t *cur_op);
|
||||
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
||||
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
||||
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
||||
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
||||
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
||||
|
@@ -337,8 +337,6 @@ void osd_t::report_statistics()
|
||||
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
|
||||
pg_stats["degraded_count"] = pg.degraded_objects.size();
|
||||
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
|
||||
if (pg.corrupted_count)
|
||||
pg_stats["corrupted_count"] = pg.corrupted_count;
|
||||
pg_stats["write_osd_set"] = pg.cur_set;
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
@@ -694,11 +692,6 @@ void osd_t::apply_pg_config()
|
||||
pg_it->second.all_peers == vec_all_peers)
|
||||
{
|
||||
// No change in osd_set and history
|
||||
if (pg_it->second.next_scrub != pg_cfg.next_scrub)
|
||||
{
|
||||
pg_it->second.next_scrub = pg_cfg.next_scrub;
|
||||
schedule_scrub(pg_it->second);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
else
|
||||
@@ -750,7 +743,6 @@ void osd_t::apply_pg_config()
|
||||
.reported_epoch = pg_cfg.epoch,
|
||||
.target_history = pg_cfg.target_history,
|
||||
.all_peers = vec_all_peers,
|
||||
.next_scrub = pg_cfg.next_scrub,
|
||||
.target_set = pg_cfg.target_set,
|
||||
};
|
||||
if (pg.scheme == POOL_SCHEME_EC)
|
||||
@@ -891,8 +883,6 @@ void osd_t::report_pg_states()
|
||||
{ "all_peers", pg.all_peers },
|
||||
{ "osd_sets", pg.target_history },
|
||||
};
|
||||
if (pg.next_scrub)
|
||||
history_value["next_scrub"] = pg.next_scrub;
|
||||
checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", history_key },
|
||||
|
@@ -192,9 +192,7 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
op->bs_op = NULL;
|
||||
delete op;
|
||||
},
|
||||
{
|
||||
.len = (uint32_t)count,
|
||||
},
|
||||
.len = (uint32_t)count,
|
||||
.buf = op->buf,
|
||||
});
|
||||
bs->enqueue_op(op->bs_op);
|
||||
@@ -305,25 +303,27 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||
};
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf("Submitting recovery operation for %lx:%lx (%s)\n", op->oid.inode, op->oid.stripe, op->degraded ? "degraded" : "misplaced");
|
||||
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||
}
|
||||
op->osd_op->peer_fd = -1;
|
||||
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
||||
{
|
||||
if (osd_op->reply.hdr.retval < 0)
|
||||
{
|
||||
// Error recovering object
|
||||
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
|
||||
printf(
|
||||
"[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n",
|
||||
INODE_POOL(op->oid.inode),
|
||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||
op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (log_level > 2)
|
||||
{
|
||||
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||
if (osd_op->reply.hdr.retval == -EPIPE)
|
||||
{
|
||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||
printf(
|
||||
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
|
||||
INODE_POOL(op->oid.inode),
|
||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||
op->oid.inode, op->oid.stripe
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Failed to recover an object");
|
||||
}
|
||||
}
|
||||
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
|
||||
op->osd_op = NULL;
|
||||
|
@@ -21,6 +21,4 @@ const char* osd_op_names[] = {
|
||||
"primary_delete",
|
||||
"ping",
|
||||
"sec_read_bmp",
|
||||
"scrub",
|
||||
"describe",
|
||||
};
|
||||
|
@@ -29,9 +29,7 @@
|
||||
#define OSD_OP_DELETE 14
|
||||
#define OSD_OP_PING 15
|
||||
#define OSD_OP_SEC_READ_BMP 16
|
||||
#define OSD_OP_SCRUB 17
|
||||
#define OSD_OP_DESCRIBE 18
|
||||
#define OSD_OP_MAX 18
|
||||
#define OSD_OP_MAX 16
|
||||
#define OSD_RW_MAX 64*1024*1024
|
||||
#define OSD_PROTOCOL_VERSION 1
|
||||
|
||||
@@ -45,11 +43,6 @@
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Constants for osd_reply_describe_item_t.loc_bad
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
#define LOC_INCONSISTENT 4
|
||||
|
||||
// common request and reply headers
|
||||
struct __attribute__((__packed__)) osd_op_header_t
|
||||
{
|
||||
@@ -180,11 +173,6 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
|
||||
uint64_t pg_stripe_size;
|
||||
// inode range (used to select pools)
|
||||
uint64_t min_inode, max_inode;
|
||||
// min/max oid stripe, added after inodes for backwards compatibility
|
||||
// also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
|
||||
uint64_t min_stripe, max_stripe;
|
||||
// max stable object count
|
||||
uint32_t stable_limit;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_sec_list_t
|
||||
@@ -235,36 +223,6 @@ struct __attribute__((__packed__)) osd_reply_sync_t
|
||||
osd_reply_header_t header;
|
||||
};
|
||||
|
||||
// describe unclean object states in detail
|
||||
struct __attribute__((__packed__)) osd_op_describe_t
|
||||
{
|
||||
osd_op_header_t header;
|
||||
// state mask to filter objects by state (0 or 0xfff..ff = all objects)
|
||||
uint64_t object_state;
|
||||
// minimum inode and offset
|
||||
uint64_t min_inode, min_offset;
|
||||
// maximum inode and offset
|
||||
uint64_t max_inode, max_offset;
|
||||
// limit
|
||||
uint64_t limit;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_describe_t
|
||||
{
|
||||
osd_reply_header_t header;
|
||||
// size of the resulting <osd_reply_describe_item_t> array in bytes
|
||||
uint64_t result_bytes;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_describe_item_t
|
||||
{
|
||||
uint64_t inode;
|
||||
uint64_t stripe;
|
||||
uint32_t role; // part number: 0 for replicas, 0..pg_size-1 for EC
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
|
||||
osd_num_t osd_num; // OSD number
|
||||
};
|
||||
|
||||
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
|
||||
union osd_any_op_t
|
||||
{
|
||||
@@ -278,7 +236,6 @@ union osd_any_op_t
|
||||
osd_op_show_config_t show_conf;
|
||||
osd_op_rw_t rw;
|
||||
osd_op_sync_t sync;
|
||||
osd_op_describe_t describe;
|
||||
uint8_t buf[OSD_PACKET_SIZE];
|
||||
};
|
||||
|
||||
@@ -294,7 +251,6 @@ union osd_any_reply_t
|
||||
osd_reply_show_config_t show_conf;
|
||||
osd_reply_rw_t rw;
|
||||
osd_reply_sync_t sync;
|
||||
osd_reply_describe_t describe;
|
||||
uint8_t buf[OSD_PACKET_SIZE];
|
||||
};
|
||||
|
||||
|
@@ -25,7 +25,6 @@ void osd_t::handle_peers()
|
||||
{
|
||||
p.second.calc_object_states(log_level);
|
||||
report_pg_state(p.second);
|
||||
schedule_scrub(p.second);
|
||||
incomplete_objects += p.second.incomplete_objects.size();
|
||||
misplaced_objects += p.second.misplaced_objects.size();
|
||||
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
|
||||
@@ -84,13 +83,6 @@ void osd_t::handle_peers()
|
||||
peering_state = peering_state & ~OSD_RECOVERING;
|
||||
}
|
||||
}
|
||||
if (peering_state & OSD_SCRUBBING)
|
||||
{
|
||||
if (!continue_scrub())
|
||||
{
|
||||
peering_state = peering_state & ~OSD_SCRUBBING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
@@ -136,11 +128,9 @@ void osd_t::reset_pg(pg_t & pg)
|
||||
pg.state_dict.clear();
|
||||
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
corrupted_objects -= pg.corrupted_count;
|
||||
incomplete_objects -= pg.incomplete_objects.size();
|
||||
misplaced_objects -= pg.misplaced_objects.size();
|
||||
degraded_objects -= pg.degraded_objects.size();
|
||||
pg.corrupted_count = 0;
|
||||
pg.incomplete_objects.clear();
|
||||
pg.misplaced_objects.clear();
|
||||
pg.degraded_objects.clear();
|
||||
@@ -216,7 +206,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
pg.cur_loc_set.push_back({
|
||||
.role = (uint64_t)role,
|
||||
.osd_num = pg.cur_set[role],
|
||||
.loc_bad = 0,
|
||||
.outdated = false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -329,12 +319,11 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||
op->bs_op = new blockstore_op_t();
|
||||
op->bs_op->opcode = BS_OP_LIST;
|
||||
op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||
op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||
op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||
op->bs_op->max_oid.stripe = UINT64_MAX;
|
||||
op->bs_op->pg_count = pg_counts[ps->pool_id];
|
||||
op->bs_op->pg_number = ps->pg_num-1;
|
||||
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||
op->bs_op->len = pg_counts[ps->pool_id];
|
||||
op->bs_op->offset = ps->pg_num-1;
|
||||
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
||||
{
|
||||
if (op->bs_op->retval < 0)
|
||||
@@ -494,7 +483,6 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
pg.all_peers = pg.target_set;
|
||||
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
||||
pg.cur_peers = pg.target_set;
|
||||
plan_scrub(pg, false);
|
||||
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
|
||||
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
|
||||
pg_cfg.target_history = pg.target_history;
|
||||
@@ -538,7 +526,6 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
pg.cur_peers.push_back(pg_osd);
|
||||
}
|
||||
}
|
||||
plan_scrub(pg, false);
|
||||
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
|
||||
pg_cfg.target_history = pg.target_history;
|
||||
pg_cfg.all_peers = pg.all_peers;
|
||||
|
@@ -255,7 +255,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
}
|
||||
else if (n_mismatched > 0)
|
||||
{
|
||||
if (log_level > 2)
|
||||
if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize))
|
||||
{
|
||||
printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||
}
|
||||
@@ -280,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.loc_bad = 0,
|
||||
.outdated = false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -302,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.loc_bad = LOC_OUTDATED,
|
||||
.outdated = true,
|
||||
});
|
||||
if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
|
||||
{
|
||||
@@ -322,75 +322,65 @@ void pg_obj_state_check_t::finish_object()
|
||||
}
|
||||
else
|
||||
{
|
||||
pg->add_object_to_state(oid, state, osd_set);
|
||||
}
|
||||
}
|
||||
|
||||
pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
|
||||
{
|
||||
auto it = state_dict.find(osd_set);
|
||||
if (it == state_dict.end())
|
||||
{
|
||||
std::vector<osd_num_t> read_target;
|
||||
if (scheme == POOL_SCHEME_REPLICATED)
|
||||
auto it = pg->state_dict.find(osd_set);
|
||||
if (it == pg->state_dict.end())
|
||||
{
|
||||
for (auto & o: osd_set)
|
||||
std::vector<uint64_t> read_target;
|
||||
if (replicated)
|
||||
{
|
||||
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
read_target.push_back(o.osd_num);
|
||||
if (!o.outdated)
|
||||
{
|
||||
read_target.push_back(o.osd_num);
|
||||
}
|
||||
}
|
||||
while (read_target.size() < pg->pg_size)
|
||||
{
|
||||
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
||||
read_target.push_back(0);
|
||||
}
|
||||
}
|
||||
while (read_target.size() < pg_size)
|
||||
else
|
||||
{
|
||||
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
||||
read_target.push_back(0);
|
||||
read_target.resize(pg->pg_size);
|
||||
for (int i = 0; i < pg->pg_size; i++)
|
||||
{
|
||||
read_target[i] = 0;
|
||||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.outdated)
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
}
|
||||
}
|
||||
pg->state_dict[osd_set] = {
|
||||
.read_target = read_target,
|
||||
.osd_set = osd_set,
|
||||
.state = state,
|
||||
.object_count = 1,
|
||||
};
|
||||
it = pg->state_dict.find(osd_set);
|
||||
}
|
||||
else
|
||||
{
|
||||
read_target.resize(pg_size);
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
{
|
||||
read_target[i] = 0;
|
||||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
}
|
||||
it->second.object_count++;
|
||||
}
|
||||
if (state & OBJ_INCOMPLETE)
|
||||
{
|
||||
pg->incomplete_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_DEGRADED)
|
||||
{
|
||||
pg->degraded_objects[oid] = &it->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
pg->misplaced_objects[oid] = &it->second;
|
||||
}
|
||||
state_dict[osd_set] = {
|
||||
.read_target = read_target,
|
||||
.osd_set = osd_set,
|
||||
.state = state,
|
||||
.object_count = 1,
|
||||
};
|
||||
it = state_dict.find(osd_set);
|
||||
}
|
||||
else
|
||||
{
|
||||
it->second.object_count++;
|
||||
}
|
||||
if (state & OBJ_INCONSISTENT)
|
||||
{
|
||||
inconsistent_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_INCOMPLETE)
|
||||
{
|
||||
incomplete_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_DEGRADED)
|
||||
{
|
||||
degraded_objects[oid] = &it->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
misplaced_objects[oid] = &it->second;
|
||||
}
|
||||
return &it->second;
|
||||
}
|
||||
|
||||
// FIXME: Write at least some tests for this function
|
||||
@@ -456,9 +446,7 @@ void pg_t::calc_object_states(int log_level)
|
||||
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
|
||||
std::to_string(loc.osd_num)+
|
||||
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
|
||||
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
|
||||
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
|
||||
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
|
||||
(loc.outdated ? "(old)" : "");
|
||||
}
|
||||
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||
}
|
||||
@@ -468,7 +456,7 @@ void pg_t::calc_object_states(int log_level)
|
||||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
@@ -477,15 +465,12 @@ void pg_t::print_state()
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
(state & PG_STOPPING) ? "stopping" : "",
|
||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||
(state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
|
||||
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
|
||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
||||
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
|
||||
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
|
||||
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
|
||||
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
|
||||
(state & PG_SCRUBBING) ? " + scrubbing" : "",
|
||||
total_count
|
||||
);
|
||||
}
|
||||
|
@@ -17,7 +17,7 @@ struct pg_obj_loc_t
|
||||
{
|
||||
uint64_t role;
|
||||
osd_num_t osd_num;
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
|
||||
bool outdated;
|
||||
};
|
||||
|
||||
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
||||
@@ -30,7 +30,6 @@ struct pg_osd_set_state_t
|
||||
pg_osd_set_t osd_set;
|
||||
uint64_t state = 0;
|
||||
uint64_t object_count = 0;
|
||||
uint64_t ref_count = 0;
|
||||
};
|
||||
|
||||
struct pg_list_result_t
|
||||
@@ -92,8 +91,6 @@ struct pg_t
|
||||
// target history and all potential peers
|
||||
std::vector<std::vector<osd_num_t>> target_history;
|
||||
std::vector<osd_num_t> all_peers;
|
||||
// next scrub time
|
||||
uint64_t next_scrub = 0;
|
||||
bool history_changed = false;
|
||||
// peer list from the last peering event
|
||||
std::vector<osd_num_t> cur_peers;
|
||||
@@ -109,8 +106,7 @@ struct pg_t
|
||||
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||
uint64_t corrupted_count;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
||||
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
||||
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
||||
btree::btree_map<object_id, uint64_t> ver_override;
|
||||
@@ -120,16 +116,15 @@ struct pg_t
|
||||
int inflight = 0; // including write_queue
|
||||
std::multimap<object_id, osd_op_t*> write_queue;
|
||||
|
||||
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
|
||||
void calc_object_states(int log_level);
|
||||
void print_state();
|
||||
};
|
||||
|
||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||
{
|
||||
return a.loc_bad < b.loc_bad ||
|
||||
a.loc_bad == b.loc_bad && a.role < b.role ||
|
||||
a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
|
||||
return a.outdated < b.outdated ||
|
||||
a.outdated == b.outdated && a.role < b.role ||
|
||||
a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
|
||||
}
|
||||
|
||||
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
||||
|
@@ -52,9 +52,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
finish_op(cur_op, -EINVAL);
|
||||
return false;
|
||||
}
|
||||
// Scrub is similar to r/w, so it's also handled here
|
||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
|
||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
|
||||
int chain_size = 0;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
||||
{
|
||||
@@ -92,8 +90,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
chain_size * (
|
||||
// - copy of the chain
|
||||
sizeof(inode_t) +
|
||||
// - object states for every chain item
|
||||
sizeof(void*) +
|
||||
// - bitmap buffers for chained read
|
||||
stripe_count * clean_entry_bitmap_size +
|
||||
// - 'missing' flags for chained reads
|
||||
@@ -121,8 +117,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
{
|
||||
op_data->read_chain = (inode_t*)data_buf;
|
||||
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
|
||||
op_data->chain_states = (pg_osd_set_state_t**)data_buf;
|
||||
data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
|
||||
op_data->snapshot_bitmaps = data_buf;
|
||||
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
|
||||
op_data->missing_flags = (uint8_t*)data_buf;
|
||||
@@ -137,7 +131,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
inode_it->second.parent_id != cur_op->req.rw.inode)
|
||||
{
|
||||
op_data->read_chain[chain_num++] = inode_it->second.parent_id;
|
||||
op_data->chain_states[chain_num++] = NULL;
|
||||
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
|
||||
}
|
||||
}
|
||||
@@ -145,12 +138,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
return true;
|
||||
}
|
||||
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
*object_state = NULL;
|
||||
return pg.cur_set.data();
|
||||
return def;
|
||||
}
|
||||
auto st_it = pg.incomplete_objects.find(oid);
|
||||
if (st_it != pg.incomplete_objects.end())
|
||||
@@ -171,7 +164,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t
|
||||
return st_it->second->read_target.data();
|
||||
}
|
||||
*object_state = NULL;
|
||||
return pg.cur_set.data();
|
||||
return def;
|
||||
}
|
||||
|
||||
void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
@@ -190,7 +183,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
goto resume_1;
|
||||
else if (op_data->st == 2)
|
||||
goto resume_2;
|
||||
resume_0:
|
||||
cur_op->reply.rw.bitmap_len = 0;
|
||||
{
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
@@ -214,17 +206,15 @@ resume_0:
|
||||
// Determine version
|
||||
auto vo_it = pg.ver_override.find(op_data->oid);
|
||||
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
// PG may have degraded or misplaced objects
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
op_data->prev_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
// PG may be degraded or have misplaced objects
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
}
|
||||
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Fast happy-path
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED &&
|
||||
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
finish_op(cur_op, -EIO);
|
||||
return;
|
||||
}
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||
op_data->st = 1;
|
||||
@@ -250,14 +240,6 @@ resume_1:
|
||||
resume_2:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// I/O or checksum error
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
|
||||
goto resume_0;
|
||||
}
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -296,284 +278,10 @@ resume_2:
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
|
||||
{
|
||||
pg_osd_set_state_t *object_state = NULL;
|
||||
get_object_osd_set(pg, oid, &object_state);
|
||||
if (prev_object_state != object_state)
|
||||
{
|
||||
// Object state changed in between by a parallel I/O operation, skip marking as failed
|
||||
if (ref)
|
||||
{
|
||||
deref_object_state(pg, &prev_object_state, ref);
|
||||
if (object_state)
|
||||
object_state->ref_count++;
|
||||
}
|
||||
return object_state;
|
||||
}
|
||||
pg_osd_set_t corrupted_set;
|
||||
if (object_state)
|
||||
{
|
||||
corrupted_set = object_state->osd_set;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < pg.cur_set.size(); i++)
|
||||
{
|
||||
corrupted_set.push_back((pg_obj_loc_t){
|
||||
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
|
||||
.osd_num = pg.cur_set[i],
|
||||
});
|
||||
}
|
||||
}
|
||||
// Mark object chunk(s) as corrupted
|
||||
int changes = 0;
|
||||
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
|
||||
{
|
||||
auto & chunk = *chunk_it;
|
||||
if (stripes[chunk.role].osd_num == chunk.osd_num)
|
||||
{
|
||||
if (stripes[chunk.role].not_exists)
|
||||
{
|
||||
changes++;
|
||||
corrupted_set.erase(chunk_it, chunk_it+1);
|
||||
continue;
|
||||
}
|
||||
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad = LOC_CORRUPTED;
|
||||
}
|
||||
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
|
||||
(chunk.loc_bad & LOC_CORRUPTED))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_CORRUPTED;
|
||||
}
|
||||
}
|
||||
if (inconsistent && !chunk.loc_bad)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_INCONSISTENT;
|
||||
}
|
||||
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_INCONSISTENT;
|
||||
}
|
||||
chunk_it++;
|
||||
}
|
||||
if (!changes)
|
||||
{
|
||||
// No chunks newly marked as corrupted - object is already marked or moved
|
||||
return object_state;
|
||||
}
|
||||
int old_pg_state = pg.state;
|
||||
if (object_state)
|
||||
{
|
||||
remove_object_from_state(oid, &object_state, pg, false);
|
||||
deref_object_state(pg, &object_state, ref);
|
||||
}
|
||||
// Insert object into the new state and retry
|
||||
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
|
||||
if (ref)
|
||||
{
|
||||
object_state->ref_count++;
|
||||
}
|
||||
return object_state;
|
||||
}
|
||||
|
||||
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level)
|
||||
{
|
||||
// Object state will be calculated from <osd_set>
|
||||
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
|
||||
n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
|
||||
for (auto & chunk: osd_set)
|
||||
{
|
||||
if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
|
||||
{
|
||||
n_invalid++;
|
||||
}
|
||||
else if (chunk.loc_bad & LOC_OUTDATED)
|
||||
{
|
||||
n_outdated++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (chunk.loc_bad & LOC_INCONSISTENT)
|
||||
{
|
||||
n_inconsistent++;
|
||||
}
|
||||
if (chunk.loc_bad & LOC_CORRUPTED)
|
||||
{
|
||||
n_corrupted++;
|
||||
}
|
||||
else if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
n_roles = 1;
|
||||
int i;
|
||||
for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
|
||||
if (i == pg.cur_set.size())
|
||||
{
|
||||
n_misplaced++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!(has_roles & (1 << chunk.role)))
|
||||
{
|
||||
n_roles++;
|
||||
has_roles |= (1 << chunk.role);
|
||||
}
|
||||
if (pg.cur_set[chunk.role] != chunk.osd_num)
|
||||
{
|
||||
n_misplaced++;
|
||||
}
|
||||
}
|
||||
n_copies++;
|
||||
}
|
||||
}
|
||||
uint64_t obj_state = 0;
|
||||
int pg_state_bits = 0;
|
||||
if (n_corrupted > 0)
|
||||
{
|
||||
this->corrupted_objects++;
|
||||
pg.corrupted_count++;
|
||||
obj_state |= OBJ_CORRUPTED;
|
||||
pg_state_bits |= PG_HAS_CORRUPTED;
|
||||
}
|
||||
if (n_invalid > 0 || n_inconsistent > 0)
|
||||
{
|
||||
this->inconsistent_objects++;
|
||||
obj_state |= OBJ_INCONSISTENT;
|
||||
pg_state_bits |= PG_HAS_INCONSISTENT;
|
||||
}
|
||||
else if (n_roles < pg.pg_data_size)
|
||||
{
|
||||
this->incomplete_objects++;
|
||||
obj_state |= OBJ_INCOMPLETE;
|
||||
pg_state_bits = PG_HAS_INCOMPLETE;
|
||||
}
|
||||
else if (n_roles < pg.pg_cursize)
|
||||
{
|
||||
this->degraded_objects++;
|
||||
obj_state |= OBJ_DEGRADED;
|
||||
pg_state_bits = PG_HAS_DEGRADED;
|
||||
}
|
||||
else if (n_misplaced > 0 || n_outdated > 0)
|
||||
{
|
||||
this->misplaced_objects++;
|
||||
obj_state |= OBJ_MISPLACED;
|
||||
pg_state_bits = PG_HAS_MISPLACED;
|
||||
}
|
||||
if (this->log_level >= log_at_level)
|
||||
{
|
||||
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
|
||||
for (int i = 0, j = 0; i < object_state_bit_count; i++)
|
||||
{
|
||||
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
|
||||
{
|
||||
printf((j++) ? "+%s" : "%s", object_state_names[i]);
|
||||
}
|
||||
}
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
printf(": %lu copies available", n_copies);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(": %lu parts / %lu copies available", n_roles, n_copies);
|
||||
}
|
||||
if (n_invalid > 0)
|
||||
{
|
||||
printf(", %lu invalid", n_invalid);
|
||||
}
|
||||
if (n_outdated > 0)
|
||||
{
|
||||
printf(", %lu outdated", n_outdated);
|
||||
}
|
||||
if (n_misplaced > 0)
|
||||
{
|
||||
printf(", %lu misplaced", n_misplaced);
|
||||
}
|
||||
if (n_corrupted > 0)
|
||||
{
|
||||
printf(", %lu corrupted", n_corrupted);
|
||||
}
|
||||
if (n_inconsistent > 0)
|
||||
{
|
||||
printf(", %lu inconsistent", n_inconsistent);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
pg.state |= pg_state_bits;
|
||||
if (pg.state != old_pg_state)
|
||||
{
|
||||
report_pg_state(pg);
|
||||
if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
|
||||
(old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
|
||||
{
|
||||
// Restart recovery from degraded objects
|
||||
recovery_last_degraded = true;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
if (!obj_state)
|
||||
{
|
||||
// Object is clean
|
||||
return NULL;
|
||||
}
|
||||
// Insert object into the new state and retry
|
||||
return pg.add_object_to_state(oid, obj_state, osd_set);
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
|
||||
{
|
||||
if (!*object_state)
|
||||
{
|
||||
return;
|
||||
}
|
||||
pg_osd_set_state_t *recheck_state = NULL;
|
||||
get_object_osd_set(pg, oid, &recheck_state);
|
||||
if (recheck_state != *object_state)
|
||||
{
|
||||
recheck_state->ref_count++;
|
||||
(*object_state)->ref_count--;
|
||||
*object_state = recheck_state;
|
||||
return;
|
||||
}
|
||||
bool changed = false;
|
||||
(*object_state)->object_count--;
|
||||
if ((*object_state)->state & OBJ_CORRUPTED)
|
||||
{
|
||||
this->corrupted_objects--;
|
||||
pg.corrupted_count--;
|
||||
if (!pg.corrupted_count)
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_CORRUPTED;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
if ((*object_state)->state & OBJ_INCONSISTENT)
|
||||
{
|
||||
this->inconsistent_objects--;
|
||||
pg.inconsistent_objects.erase(oid);
|
||||
if (!pg.inconsistent_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
else if ((*object_state)->state & OBJ_INCOMPLETE)
|
||||
if (object_state->state & OBJ_INCOMPLETE)
|
||||
{
|
||||
// Successful write means that object is not incomplete anymore
|
||||
this->incomplete_objects--;
|
||||
@@ -581,52 +289,41 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
|
||||
if (!pg.incomplete_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
||||
changed = true;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else if ((*object_state)->state & OBJ_DEGRADED)
|
||||
else if (object_state->state & OBJ_DEGRADED)
|
||||
{
|
||||
this->degraded_objects--;
|
||||
pg.degraded_objects.erase(oid);
|
||||
if (!pg.degraded_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
||||
changed = true;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else if ((*object_state)->state & OBJ_MISPLACED)
|
||||
else if (object_state->state & OBJ_MISPLACED)
|
||||
{
|
||||
this->misplaced_objects--;
|
||||
pg.misplaced_objects.erase(oid);
|
||||
if (!pg.misplaced_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||
changed = true;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
|
||||
}
|
||||
if (changed && report)
|
||||
{
|
||||
report_pg_state(pg);
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
|
||||
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (*object_state)
|
||||
if (*object_state && !(--(*object_state)->object_count))
|
||||
{
|
||||
if (deref)
|
||||
{
|
||||
(*object_state)->ref_count--;
|
||||
}
|
||||
if (!(*object_state)->object_count && !(*object_state)->ref_count)
|
||||
{
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
}
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -656,28 +353,21 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
|
||||
}
|
||||
resume_1:
|
||||
// Determine which OSDs contain this object and delete it
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
if (op_data->object_state)
|
||||
{
|
||||
op_data->object_state->ref_count++;
|
||||
}
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
// Submit 1 read to determine the actual version number
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
op_data->prev_set = NULL;
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
@@ -693,7 +383,6 @@ resume_4:
|
||||
resume_5:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -706,8 +395,8 @@ resume_5:
|
||||
}
|
||||
else
|
||||
{
|
||||
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
pg.total_count--;
|
||||
cur_op->reply.hdr.retval = 0;
|
||||
|
@@ -9,7 +9,6 @@
|
||||
#define SUBMIT_READ 0
|
||||
#define SUBMIT_RMW_READ 1
|
||||
#define SUBMIT_WRITE 2
|
||||
#define SUBMIT_SCRUB_READ 3
|
||||
|
||||
struct unstable_osd_num_t
|
||||
{
|
||||
@@ -51,7 +50,6 @@ struct osd_primary_op_data_t
|
||||
// for read_bitmaps
|
||||
void *snapshot_bitmaps;
|
||||
inode_t *read_chain;
|
||||
pg_osd_set_state_t **chain_states;
|
||||
uint8_t *missing_flags;
|
||||
int chain_size;
|
||||
osd_chain_read_t *chain_reads;
|
||||
|
@@ -40,24 +40,10 @@ resume_3:
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// Handle corrupted reads and retry...
|
||||
check_corrupted_chained(pg, cur_op);
|
||||
free(cur_op->buf);
|
||||
cur_op->buf = NULL;
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
// FIXME: We can in theory retry only specific parts instead of the whole operation
|
||||
goto resume_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
send_chained_read_results(pg, cur_op);
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
@@ -145,7 +131,8 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
|
||||
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
||||
auto vo_it = pg.ver_override.find(cur_oid);
|
||||
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
|
||||
pg_osd_set_state_t *object_state;
|
||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
osd_num_t read_target = 0;
|
||||
@@ -260,7 +247,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
assert(!subop->buf);
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
.sec_read_bmp = {
|
||||
@@ -389,8 +375,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
op_data->chain_read_count = chain_reads.size();
|
||||
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
||||
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
||||
// FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
|
||||
// (but it's slightly harder to handle in send_chained_read_results())
|
||||
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
|
||||
);
|
||||
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
||||
@@ -419,7 +403,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
||||
@@ -431,17 +416,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
}
|
||||
op_data->degraded = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||
if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, -EIO);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
@@ -459,7 +433,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(!cur_op->buf);
|
||||
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
|
||||
void *cur_buf = cur_op->buf;
|
||||
for (int cri = 0; cri < chain_reads.size(); cri++)
|
||||
@@ -495,8 +468,12 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
||||
auto vo_it = pg.ver_override.find(cur_oid);
|
||||
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||
uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
}
|
||||
int zero_read = -1;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
@@ -510,33 +487,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Walk over the finished chained reads of <cur_op> and call mark_object_corrupted()
// for every object that has at least one stripe with the read_error flag set.
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Replicated pools use a single stripe descriptor per object, other schemes use <pg_size>
    const int stripes_per_obj = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
    // Stripe descriptors are located right after the chain_reads array
    osd_rmw_stripe_t *all_stripes = (osd_rmw_stripe_t*)(
        (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
    );
    for (int read_idx = 0; read_idx < op_data->chain_read_count; read_idx++)
    {
        osd_rmw_stripe_t *obj_stripes = all_stripes + op_data->chain_reads[read_idx].chain_pos*stripes_per_obj;
        // Find the first stripe with a read error, if any
        int bad = 0;
        while (bad < stripes_per_obj && !obj_stripes[bad].read_error)
        {
            bad++;
        }
        if (bad >= stripes_per_obj)
        {
            // All stripes of this object were read without errors
            continue;
        }
        object_id cur_oid = { .inode = op_data->chain_reads[read_idx].inode, .stripe = op_data->oid.stripe };
        mark_object_corrupted(
            pg, cur_oid, op_data->chain_states[op_data->chain_reads[read_idx].chain_pos],
            obj_stripes, false, false
        );
    }
}
|
||||
|
||||
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
|
@@ -1,128 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <queue>
|
||||
#include "osd_primary.h"
|
||||
|
||||
struct unclean_list_t
|
||||
{
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*>::iterator it, end;
|
||||
uint64_t state_mask, state;
|
||||
};
|
||||
|
||||
struct desc_item_list_t
|
||||
{
|
||||
int alloc, size;
|
||||
osd_reply_describe_item_t *items;
|
||||
};
|
||||
|
||||
static void include_list(std::vector<unclean_list_t> & lists,
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> & from,
|
||||
osd_op_describe_t & desc, uint64_t state_mask, uint64_t state)
|
||||
{
|
||||
auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){
|
||||
.inode = desc.min_inode,
|
||||
.stripe = desc.min_offset,
|
||||
}) : from.begin();
|
||||
auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){
|
||||
.inode = desc.max_inode,
|
||||
.stripe = desc.max_offset,
|
||||
}) : from.end();
|
||||
lists.push_back((unclean_list_t){
|
||||
.it = it,
|
||||
.end = end_it,
|
||||
.state_mask = state_mask,
|
||||
.state = state,
|
||||
});
|
||||
}
|
||||
|
||||
struct obj_list_t
|
||||
{
|
||||
object_id oid;
|
||||
int list_id;
|
||||
};
|
||||
|
||||
static inline bool operator < (const obj_list_t & a, const obj_list_t & b)
|
||||
{
|
||||
return b.oid < a.oid;
|
||||
}
|
||||
|
||||
static void scan_lists(std::vector<unclean_list_t> & lists, uint64_t limit, desc_item_list_t & res)
|
||||
{
|
||||
if (limit > 1048576)
|
||||
{
|
||||
limit = 1048576;
|
||||
}
|
||||
std::priority_queue<obj_list_t> min;
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i].it != lists[i].end)
|
||||
{
|
||||
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
|
||||
}
|
||||
}
|
||||
while (min.size() && (!limit || res.size < limit))
|
||||
{
|
||||
auto i = min.top().list_id;
|
||||
min.pop();
|
||||
for (auto & chunk: lists[i].it->second->osd_set)
|
||||
{
|
||||
if (res.size >= res.alloc)
|
||||
{
|
||||
res.alloc = !res.alloc ? 128 : (res.alloc*2);
|
||||
res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t));
|
||||
}
|
||||
res.items[res.size++] = (osd_reply_describe_item_t){
|
||||
.inode = lists[i].it->first.inode,
|
||||
.stripe = lists[i].it->first.stripe,
|
||||
.role = (uint32_t)chunk.role,
|
||||
.loc_bad = chunk.loc_bad,
|
||||
.osd_num = chunk.osd_num,
|
||||
};
|
||||
}
|
||||
lists[i].it++;
|
||||
if (lists[i].it != lists[i].end)
|
||||
{
|
||||
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Describe unclean objects
|
||||
void osd_t::continue_primary_describe(osd_op_t *cur_op)
|
||||
{
|
||||
auto & desc = cur_op->req.describe;
|
||||
if (!desc.object_state)
|
||||
desc.object_state = ~desc.object_state;
|
||||
std::vector<unclean_list_t> lists;
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
auto & pg = pg_it->second;
|
||||
if (desc.object_state & OBJ_INCONSISTENT)
|
||||
include_list(lists, pg.inconsistent_objects, desc, 0, 0);
|
||||
if (desc.object_state & OBJ_CORRUPTED)
|
||||
{
|
||||
if (!(desc.object_state & OBJ_INCOMPLETE))
|
||||
include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
if (!(desc.object_state & OBJ_DEGRADED))
|
||||
include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
if (!(desc.object_state & OBJ_MISPLACED))
|
||||
include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
}
|
||||
uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0;
|
||||
if (desc.object_state & OBJ_INCOMPLETE)
|
||||
include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0);
|
||||
if (desc.object_state & OBJ_DEGRADED)
|
||||
include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0);
|
||||
if (desc.object_state & OBJ_MISPLACED)
|
||||
include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0);
|
||||
}
|
||||
desc_item_list_t res = {};
|
||||
scan_lists(lists, desc.limit, res);
|
||||
assert(!cur_op->buf);
|
||||
cur_op->buf = res.items;
|
||||
cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t);
|
||||
if (res.items)
|
||||
cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t));
|
||||
finish_op(cur_op, res.size);
|
||||
}
|
@@ -9,7 +9,6 @@ void osd_t::autosync()
|
||||
{
|
||||
autosync_op = new osd_op_t();
|
||||
autosync_op->op_type = OSD_OP_IN;
|
||||
autosync_op->peer_fd = -1;
|
||||
autosync_op->req = (osd_any_op_t){
|
||||
.sync = {
|
||||
.header = {
|
||||
@@ -81,11 +80,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
free(cur_op->op_data);
|
||||
cur_op->op_data = NULL;
|
||||
}
|
||||
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
||||
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
||||
cur_op->reply.hdr.retval = retval;
|
||||
if (cur_op->peer_fd == -1)
|
||||
if (!cur_op->peer_fd)
|
||||
{
|
||||
// Copy lambda to be unaffected by `delete op`
|
||||
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
|
||||
@@ -96,6 +91,10 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
auto cl_it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != msgr.clients.end())
|
||||
{
|
||||
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
||||
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
||||
cur_op->reply.hdr.retval = retval;
|
||||
msgr.outbox_push(cur_op);
|
||||
}
|
||||
else
|
||||
@@ -143,50 +142,43 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_num_t role_osd_num = osd_set[role];
|
||||
int stripe_num = rep ? 0 : role;
|
||||
osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
|
||||
if (role_osd_num != 0)
|
||||
{
|
||||
int stripe_num = rep ? 0 : role;
|
||||
osd_op_t *subop = op_data->subops + i;
|
||||
uint32_t subop_len = wr
|
||||
? si->write_end - si->write_start
|
||||
: si->read_end - si->read_start;
|
||||
if (!wr && si->read_end == UINT32_MAX)
|
||||
? stripes[stripe_num].write_end - stripes[stripe_num].write_start
|
||||
: stripes[stripe_num].read_end - stripes[stripe_num].read_start;
|
||||
if (!wr && stripes[stripe_num].read_end == UINT32_MAX)
|
||||
{
|
||||
subop_len = 0;
|
||||
}
|
||||
si->osd_num = role_osd_num;
|
||||
si->read_error = false;
|
||||
subop->bitmap = si->bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
// Using rmw_buf to pass pointer to stripes. Dirty but should work
|
||||
subop->rmw_buf = si;
|
||||
if (role_osd_num == this->osd_num)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
|
||||
subop->op_type = (uint64_t)cur_op;
|
||||
subop->bs_op = new blockstore_op_t((blockstore_op_t){
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->bs_op = new blockstore_op_t({
|
||||
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
|
||||
.callback = [subop, this](blockstore_op_t *bs_subop)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
{ {
|
||||
.oid = (object_id){
|
||||
.inode = inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
.version = op_version,
|
||||
.offset = wr ? si->write_start : si->read_start,
|
||||
.len = subop_len,
|
||||
} },
|
||||
.buf = wr ? si->write_buf : si->read_buf,
|
||||
.bitmap = si->bmp_buf,
|
||||
.oid = {
|
||||
.inode = inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
.version = op_version,
|
||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||
.len = subop_len,
|
||||
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
|
||||
.bitmap = stripes[stripe_num].bmp_buf,
|
||||
});
|
||||
#ifdef OSD_DEBUG
|
||||
printf(
|
||||
@@ -200,6 +192,8 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -211,7 +205,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
.version = op_version,
|
||||
.offset = wr ? si->write_start : si->read_start,
|
||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||
.len = subop_len,
|
||||
.attr_len = wr ? clean_entry_bitmap_size : 0,
|
||||
};
|
||||
@@ -224,16 +218,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
#endif
|
||||
if (wr)
|
||||
{
|
||||
if (si->write_end > si->write_start)
|
||||
if (stripes[stripe_num].write_end > stripes[stripe_num].write_start)
|
||||
{
|
||||
subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
|
||||
subop->iov.push_back(stripes[stripe_num].write_buf, stripes[stripe_num].write_end - stripes[stripe_num].write_start);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (subop_len > 0)
|
||||
{
|
||||
subop->iov.push_back(si->read_buf, subop_len);
|
||||
subop->iov.push_back(stripes[stripe_num].read_buf, subop_len);
|
||||
}
|
||||
}
|
||||
subop->callback = [cur_op, this](osd_op_t *subop)
|
||||
@@ -256,10 +250,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
}
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
si->osd_num = 0;
|
||||
}
|
||||
}
|
||||
return i-subop_idx;
|
||||
}
|
||||
@@ -344,45 +334,14 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
else
|
||||
expected = 0;
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
// ENOENT is not an error for almost all reads, except scrub
|
||||
retval = expected;
|
||||
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
|
||||
}
|
||||
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
|
||||
{
|
||||
uint64_t version = subop->reply.sec_rw.version;
|
||||
#ifdef OSD_DEBUG
|
||||
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
||||
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
||||
printf("subop %s %lx:%lx from osd %lu: version = %lu\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
|
||||
#endif
|
||||
if (op_data->fact_ver != UINT64_MAX)
|
||||
{
|
||||
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "different fact_versions returned from %s subops: %lu vs %lu\n",
|
||||
osd_op_names[opcode], version, op_data->fact_ver
|
||||
);
|
||||
retval = -ERANGE;
|
||||
}
|
||||
else
|
||||
op_data->fact_ver = version;
|
||||
}
|
||||
}
|
||||
if (retval != expected)
|
||||
{
|
||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||
{
|
||||
printf(
|
||||
subop->peer_fd >= 0
|
||||
? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
|
||||
: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
|
||||
"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
|
||||
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
||||
retval, expected, subop->peer_fd
|
||||
subop->peer_fd, retval, expected
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -392,33 +351,43 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
osd_op_names[opcode], subop->peer_fd, retval, expected
|
||||
);
|
||||
}
|
||||
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
|
||||
{
|
||||
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
|
||||
}
|
||||
subop->rmw_buf = NULL;
|
||||
// Error priority: ENOSPC and others > EIO > EDOM > EPIPE
|
||||
if (op_data->errcode == 0 ||
|
||||
retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
|
||||
retval == -EDOM && (op_data->errcode == -EPIPE) ||
|
||||
retval != -EIO && retval != -EDOM && retval != -EPIPE)
|
||||
// Error priority: EIO > ENOSPC > EPIPE
|
||||
if (op_data->errcode == 0 || retval == -EIO ||
|
||||
retval == -ENOSPC && op_data->errcode == -EPIPE)
|
||||
{
|
||||
op_data->errcode = retval;
|
||||
}
|
||||
op_data->errors++;
|
||||
if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
|
||||
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
|
||||
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
||||
if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
|
||||
retval != -ENOSPC))
|
||||
{
|
||||
// Drop connection on unexpected errors
|
||||
// Drop connection on any error except ENOSPC
|
||||
msgr.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
subop->rmw_buf = NULL;
|
||||
op_data->done++;
|
||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||
{
|
||||
uint64_t version = subop->reply.sec_rw.version;
|
||||
#ifdef OSD_DEBUG
|
||||
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
||||
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
||||
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
|
||||
#endif
|
||||
if (op_data->fact_ver != UINT64_MAX)
|
||||
{
|
||||
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"different fact_versions returned from "+std::string(osd_op_names[opcode])+
|
||||
" subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
|
||||
);
|
||||
}
|
||||
op_data->fact_ver = version;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((op_data->errors + op_data->done) >= op_data->n_subops)
|
||||
{
|
||||
@@ -441,10 +410,6 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
{
|
||||
continue_primary_del(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("BUG: unknown opcode");
|
||||
@@ -533,10 +498,8 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
{ {
|
||||
.oid = chunk.oid,
|
||||
.version = chunk.version,
|
||||
} },
|
||||
.oid = chunk.oid,
|
||||
.version = chunk.version,
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
@@ -650,9 +613,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
{
|
||||
.len = (uint32_t)stab_osd.len,
|
||||
},
|
||||
.len = (uint32_t)stab_osd.len,
|
||||
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
|
@@ -58,13 +58,7 @@ resume_1:
|
||||
// Determine blocks to read and write
|
||||
// Missing chunks are allowed to be overwritten even in incomplete objects
|
||||
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
if (op_data->object_state)
|
||||
{
|
||||
// Protect object_state from being freed by a parallel read operation changing it
|
||||
op_data->object_state->ref_count++;
|
||||
}
|
||||
retry_1:
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Simplified algorithm
|
||||
@@ -74,12 +68,6 @@ retry_1:
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
if (op_data->object_state->state & OBJ_INCOMPLETE)
|
||||
{
|
||||
// Refuse partial overwrite of an incomplete (corrupted) object
|
||||
cur_op->reply.hdr.retval = -EIO;
|
||||
goto continue_others;
|
||||
}
|
||||
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
||||
op_data->stripes[0].read_start = 0;
|
||||
op_data->stripes[0].read_end = bs_block_size;
|
||||
@@ -98,61 +86,19 @@ retry_1:
|
||||
}
|
||||
}
|
||||
// Read required blocks
|
||||
{
|
||||
if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
// Allow to read version number (just version number!) from corrupted chunks
|
||||
// to allow full overwrite of a corrupted object
|
||||
bool found = false;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
osd_num_t corrupted_target[op_data->pg_size];
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
corrupted_target[role] = 0;
|
||||
}
|
||||
for (auto & loc: op_data->object_state->osd_set)
|
||||
{
|
||||
if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
|
||||
{
|
||||
corrupted_target[loc.role] = loc.osd_num;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
}
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// Mark object corrupted and retry
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
|
||||
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
|
||||
goto retry_1;
|
||||
}
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
@@ -236,7 +182,6 @@ resume_10:
|
||||
// Recheck PG state after reporting history - maybe it's already stopping/restarting
|
||||
if (pg.state & (PG_STOPPING|PG_REPEERING))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
|
||||
return;
|
||||
}
|
||||
@@ -257,7 +202,6 @@ resume_5:
|
||||
// to overwrite the same version number which will result in EEXIST.
|
||||
// To fix it, we should mark the object as degraded for replicas,
|
||||
// and rollback successful part updates in case of EC.
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -266,7 +210,7 @@ resume_5:
|
||||
// We must forget the unclean state of the object before deleting it
|
||||
// so the next reads don't accidentally read a deleted version
|
||||
// And it should be done at the same time as the removal of the version override
|
||||
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
resume_6:
|
||||
@@ -321,12 +265,12 @@ resume_7:
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
else
|
||||
{
|
||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
if (op_data->n_subops > 0)
|
||||
{
|
||||
resume_8:
|
||||
|
177
src/osd_rmw.cpp
177
src/osd_rmw.cpp
@@ -1084,180 +1084,3 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
}
|
||||
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
|
||||
}
|
||||
|
||||
// Generate subsets of k items each in {0..n-1}
// first_combination() writes the first subset {0, 1, ..., k-1} into <subset>
// and returns true, or returns false without touching <subset> when k > n
static bool first_combination(int *subset, int k, int n)
{
    if (k <= n)
    {
        int item = 0;
        while (item < k)
        {
            subset[item] = item;
            item++;
        }
        return true;
    }
    return false;
}
|
||||
|
||||
// Advance <subset> (k increasing items from {0..n-1}) to the next subset in
// lexicographic order. Returns true if <subset> now holds the next subset and
// false if the previous one was the last; in that case the contents of <subset>
// are left partially modified and must not be used.
// For k <= 0 there is nothing to advance: returns false without touching <subset>
// (previously this case wrote to subset[-1]).
static bool next_combination(int *subset, int k, int n)
{
    if (k <= 0)
        return false;
    int pos = k-1;
    while (true)
    {
        subset[pos]++;
        // Position <pos> may hold at most n-1-(k-1-pos) to leave room for the items after it
        if (subset[pos] >= n-(k-1-pos))
        {
            if (pos == 0)
                return false;
            // This position is exhausted, carry over to the previous one
            pos--;
        }
        else
            break;
    }
    // Reset all positions after the incremented one to the smallest possible values
    for (pos++; pos < k; pos++)
    {
        subset[pos] = subset[pos-1]+1;
    }
    return true;
}
|
||||
|
||||
// Number of combinations C(n, k) = n! / (k! * (n-k)!).
// Computed incrementally as C(k+i, i) for i = 1..n-k, so intermediate values never
// exceed the result multiplied by n. The previous implementation calculated n!/k!
// first, which overflowed int for rather small inputs, for example C(14, 1).
// Returns 1 when k >= n, same as before.
static int c_n_k(int n, int k)
{
    long long c = 1;
    for (int i = 1; i <= (n-k); i++)
    {
        // C(k+i-1, i-1) * (k+i) == i * C(k+i, i), so the division is always exact
        c = c * (k+i) / i;
    }
    return (int)c;
}
|
||||
|
||||
std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
|
||||
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
|
||||
{
|
||||
std::vector<int> found_valid;
|
||||
int cur_live[pg_size], live_count = 0, exists_count = 0;
|
||||
osd_num_t fake_osd_set[pg_size];
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
{
|
||||
if (!stripes[role].missing)
|
||||
{
|
||||
if (!stripes[role].not_exists)
|
||||
exists_count++;
|
||||
cur_live[live_count++] = role;
|
||||
fake_osd_set[role] = role+1;
|
||||
}
|
||||
}
|
||||
if (live_count <= pg_minsize)
|
||||
{
|
||||
return std::vector<int>();
|
||||
}
|
||||
if (exists_count <= pg_minsize)
|
||||
{
|
||||
// Special case: user manually deleted some chunks
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
if (!stripes[role].missing && !stripes[role].not_exists)
|
||||
found_valid.push_back(role);
|
||||
return found_valid;
|
||||
}
|
||||
// Try to locate errors using brute force if there isn't too many combinations
|
||||
osd_rmw_stripe_t brute_stripes[pg_size];
|
||||
int out_count = live_count-pg_minsize;
|
||||
bool brute_force = out_count > 1 && c_n_k(live_count-1, out_count-1) <= max_bruteforce;
|
||||
int subset[pg_minsize], outset[out_count];
|
||||
// Select all combinations with items except the last one (== anything to compare)
|
||||
first_combination(subset, pg_minsize, live_count-1);
|
||||
uint8_t *tmp_buf = (uint8_t*)malloc_or_die(pg_size*chunk_size);
|
||||
do
|
||||
{
|
||||
memcpy(brute_stripes, stripes, sizeof(osd_rmw_stripe_t)*pg_size);
|
||||
int i = 0, j = 0, k = 0;
|
||||
for (; i < pg_minsize; i++, j++)
|
||||
while (j < subset[i])
|
||||
outset[k++] = j++;
|
||||
while (j < pg_size)
|
||||
outset[k++] = j++;
|
||||
for (int i = 0; i < out_count; i++)
|
||||
{
|
||||
brute_stripes[cur_live[outset[i]]].missing = true;
|
||||
brute_stripes[cur_live[outset[i]]].read_buf = tmp_buf+cur_live[outset[i]]*chunk_size;
|
||||
}
|
||||
for (int i = 0; i < pg_minsize; i++)
|
||||
{
|
||||
brute_stripes[i].write_buf = brute_stripes[i].read_buf;
|
||||
brute_stripes[i].req_start = 0;
|
||||
brute_stripes[i].req_end = chunk_size;
|
||||
}
|
||||
for (int i = pg_minsize; i < pg_size; i++)
|
||||
{
|
||||
brute_stripes[i].write_buf = tmp_buf+i*chunk_size;
|
||||
}
|
||||
if (is_xor)
|
||||
{
|
||||
assert(pg_size == pg_minsize+1);
|
||||
reconstruct_stripes_xor(brute_stripes, pg_size, bitmap_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
reconstruct_stripes_ec(brute_stripes, pg_size, pg_minsize, bitmap_size);
|
||||
calc_rmw_parity_ec(brute_stripes, pg_size, pg_minsize, fake_osd_set, fake_osd_set, chunk_size, bitmap_size);
|
||||
}
|
||||
for (int i = pg_minsize; i < pg_size; i++)
|
||||
{
|
||||
brute_stripes[i].read_buf = brute_stripes[i].write_buf;
|
||||
}
|
||||
int valid_count = 0;
|
||||
for (int i = 0; i < out_count; i++)
|
||||
{
|
||||
if (memcmp(brute_stripes[cur_live[outset[i]]].read_buf,
|
||||
stripes[cur_live[outset[i]]].read_buf, chunk_size) == 0)
|
||||
{
|
||||
brute_stripes[cur_live[outset[i]]].missing = false;
|
||||
valid_count++;
|
||||
}
|
||||
}
|
||||
if (valid_count > 0)
|
||||
{
|
||||
if (found_valid.size())
|
||||
{
|
||||
// Check if we found the same set from the different point of view,
|
||||
// like 1 2 3 -> valid 4 5 and 1 3 4 -> valid 2 5
|
||||
for (int i = 0, j = 0; i < pg_size; i++)
|
||||
{
|
||||
if (!brute_stripes[i].missing)
|
||||
{
|
||||
if (j >= found_valid.size() || found_valid[j] != i)
|
||||
{
|
||||
// Ambiguity: we found multiple valid sets and don't know which one is correct
|
||||
found_valid.clear();
|
||||
break;
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
if (!found_valid.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
{
|
||||
if (!brute_stripes[i].missing)
|
||||
{
|
||||
found_valid.push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (valid_count == out_count)
|
||||
{
|
||||
// All chunks are good
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!brute_force)
|
||||
{
|
||||
// Do not attempt brute force if there are too many combinations because even
|
||||
// if we find it we won't be able to check that it's the only good one
|
||||
break;
|
||||
}
|
||||
} while (out_count > 1 && next_combination(subset, pg_minsize, live_count-1));
|
||||
free(tmp_buf);
|
||||
return found_valid;
|
||||
}
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include "object_id.h"
|
||||
#include "osd_id.h"
|
||||
|
||||
@@ -27,10 +26,7 @@ struct osd_rmw_stripe_t
|
||||
// read_end=UINT32_MAX means to only read bitmap, but not data
|
||||
uint32_t read_start, read_end;
|
||||
uint32_t write_start, write_end;
|
||||
osd_num_t osd_num;
|
||||
bool missing: 1;
|
||||
bool read_error: 1;
|
||||
bool not_exists: 1;
|
||||
bool missing;
|
||||
};
|
||||
|
||||
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
|
||||
@@ -56,6 +52,3 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
||||
|
||||
void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);
|
||||
|
||||
std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
|
||||
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce);
|
||||
|
@@ -28,7 +28,6 @@ void test14();
|
||||
void test15(bool second);
|
||||
void test16();
|
||||
void test_recover_22_d2();
|
||||
void test_ec43_error_bruteforce();
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
@@ -65,8 +64,6 @@ int main(int narg, char *args[])
|
||||
test16();
|
||||
// Test 17
|
||||
test_recover_22_d2();
|
||||
// Error bruteforce
|
||||
test_ec43_error_bruteforce();
|
||||
// End
|
||||
printf("all ok\n");
|
||||
return 0;
|
||||
@@ -1109,72 +1106,3 @@ void test_recover_22_d2()
|
||||
// Done
|
||||
use_ec(4, 2, false);
|
||||
}
|
||||
|
||||
/***
|
||||
|
||||
EC 4+2 error location bruteforce
|
||||
|
||||
***/
|
||||
|
||||
// Print the expected and the actual vector, then abort unless they are equal.
// Mind the argument order: the actual value comes first, the expected one second.
static void assert_eq_vec(const std::vector<int> & b, const std::vector<int> & a)
{
    printf("Expect [");
    for (int expected_item: a)
    {
        printf(" %d", expected_item);
    }
    printf(" ] have [");
    for (int actual_item: b)
    {
        printf(" %d", actual_item);
    }
    printf(" ]\n");
    assert(a == b);
}
|
||||
|
||||
void test_ec43_error_bruteforce()
|
||||
{
|
||||
use_ec(7, 4, true);
|
||||
osd_num_t osd_set[7] = { 1, 2, 3, 4, 5, 6, 7 };
|
||||
osd_rmw_stripe_t stripes[7] = {};
|
||||
split_stripes(4, 4096, 0, 4096 * 4, stripes);
|
||||
uint8_t *write_buf = (uint8_t*)malloc_or_die(4096 * 7);
|
||||
set_pattern(write_buf+0*4096, 4096, PATTERN0);
|
||||
set_pattern(write_buf+1*4096, 4096, PATTERN1);
|
||||
set_pattern(write_buf+2*4096, 4096, PATTERN2);
|
||||
set_pattern(write_buf+3*4096, 4096, PATTERN3);
|
||||
uint8_t *rmw_buf = (uint8_t*)calc_rmw(write_buf, stripes, osd_set, 7, 4, 7, osd_set, 4096, 0);
|
||||
calc_rmw_parity_ec(stripes, 7, 4, osd_set, osd_set, 4096, 0);
|
||||
check_pattern(stripes[4].write_buf, 4096, PATTERN0^PATTERN1^PATTERN2^PATTERN3);
|
||||
check_pattern(stripes[5].write_buf, 4096, 0xfcee568ba36371ac); // 2nd EC chunk
|
||||
check_pattern(stripes[6].write_buf, 4096, 0x139274739ae6f387); // 3rd EC chunk
|
||||
memcpy(write_buf+4*4096, stripes[4].write_buf, 4096);
|
||||
memcpy(write_buf+5*4096, stripes[5].write_buf, 4096);
|
||||
memcpy(write_buf+6*4096, stripes[6].write_buf, 4096);
|
||||
// Try to locate errors
|
||||
for (int i = 0; i < 7; i++)
|
||||
{
|
||||
stripes[i].read_start = 0;
|
||||
stripes[i].read_end = 4096;
|
||||
stripes[i].read_buf = write_buf+i*4096;
|
||||
stripes[i].write_buf = NULL;
|
||||
}
|
||||
// All good chunks
|
||||
auto res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
|
||||
assert_eq_vec(res, std::vector<int>({0, 1, 2, 3, 4, 5, 6}));
|
||||
// 1 missing chunk
|
||||
set_pattern(write_buf+1*4096, 4096, 0);
|
||||
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
|
||||
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 5, 6}));
|
||||
// 2 missing chunks
|
||||
set_pattern(write_buf+1*4096, 4096, 0);
|
||||
set_pattern(write_buf+5*4096, 4096, 0);
|
||||
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
|
||||
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 6}));
|
||||
// 3 missing chunks
|
||||
set_pattern(write_buf+1*4096, 4096, 0);
|
||||
set_pattern(write_buf+5*4096, 4096, 0);
|
||||
set_pattern(write_buf+6*4096, 4096, 0);
|
||||
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
|
||||
assert_eq_vec(res, std::vector<int>());
|
||||
// Done
|
||||
free(rmw_buf);
|
||||
free(write_buf);
|
||||
use_ec(7, 4, false);
|
||||
}
|
||||
|
@@ -1,623 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
|
||||
#define SELF_FD -1
|
||||
|
||||
// Request a listing of objects of PG <pg_id> from OSD <role_osd>, starting at <min_oid>.
// The listing is requested from the local blockstore when role_osd is this OSD and
// from the peer otherwise. On success the callback stores the result in scrub_cur_list
// and calls continue_scrub(). Only one listing may be in flight (tracked in scrub_list_op).
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
{
    pool_id_t pool_id = pg_id.pool_id;
    pg_num_t pg_num = pg_id.pg_num;
    assert(!scrub_list_op);
    if (role_osd != this->osd_num)
    {
        // Peer
        osd_op_t *peer_op = new osd_op_t();
        peer_op->op_type = OSD_OP_OUT;
        peer_op->peer_fd = msgr.osd_peer_fds.at(role_osd);
        peer_op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
                .list_pg = pg_num,
                .pg_count = pg_counts[pool_id],
                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
                .min_stripe = min_oid.stripe,
                .stable_limit = scrub_list_limit,
            },
        };
        peer_op->callback = [this, role_osd](osd_op_t *done_op)
        {
            scrub_list_op = NULL;
            if (done_op->reply.hdr.retval < 0)
            {
                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, done_op->reply.hdr.retval);
                int fail_fd = done_op->peer_fd;
                delete done_op;
                msgr.stop_client(fail_fd);
                return;
            }
            scrub_cur_list = {
                .buf = (obj_ver_id*)done_op->buf,
                .total_count = (uint64_t)done_op->reply.hdr.retval,
                .stable_count = done_op->reply.sec_list.stable_count,
            };
            // The buffer now belongs to scrub_cur_list: detach it so it isn't freed with the operation
            done_op->buf = NULL;
            delete done_op;
            continue_scrub();
        };
        scrub_list_op = peer_op;
        msgr.outbox_push(peer_op);
        return;
    }
    // Self
    osd_op_t *local_op = new osd_op_t();
    local_op->op_type = 0;
    local_op->peer_fd = SELF_FD;
    clock_gettime(CLOCK_REALTIME, &local_op->tv_begin);
    local_op->bs_op = new blockstore_op_t();
    local_op->bs_op->opcode = BS_OP_LIST;
    local_op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
    if (min_oid.inode == 0 && min_oid.stripe == 0)
    {
        // No starting point given: begin from the first inode of the pool
        local_op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
        local_op->bs_op->min_oid.stripe = 0;
    }
    else
    {
        local_op->bs_op->min_oid = min_oid;
    }
    local_op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
    local_op->bs_op->max_oid.stripe = UINT64_MAX;
    local_op->bs_op->list_stable_limit = scrub_list_limit;
    local_op->bs_op->pg_count = pg_counts[pool_id];
    local_op->bs_op->pg_number = pg_num-1;
    local_op->bs_op->callback = [this, local_op](blockstore_op_t *bs_op)
    {
        scrub_list_op = NULL;
        if (local_op->bs_op->retval < 0)
        {
            printf("Local OP_LIST failed: retval=%d\n", local_op->bs_op->retval);
            force_stop(1);
            return;
        }
        add_bs_subop_stats(local_op);
        scrub_cur_list = {
            .buf = (obj_ver_id*)local_op->bs_op->buf,
            .total_count = (uint64_t)local_op->bs_op->retval,
            .stable_count = local_op->bs_op->version,
        };
        delete local_op->bs_op;
        local_op->bs_op = NULL;
        delete local_op;
        continue_scrub();
    };
    scrub_list_op = local_op;
    bs->enqueue_op(local_op->bs_op);
}
|
||||
|
||||
// Select the next thing to do for scrubbing.
//
// Returns:
//   0 - nothing to scrub at the moment
//   1 - an object listing was requested with scrub_list(), wait for it
//   2 - next_oid is set to the next object to scrub
//
// PGs are scanned starting with the PG that was scrubbed the last time (scrub_last_pg).
// Only active PGs with next_scrub in the past are considered.
int osd_t::pick_next_scrub(object_id & next_oid)
{
    if (!pgs.size())
    {
        if (scrub_cur_list.buf)
        {
            free(scrub_cur_list.buf);
            scrub_cur_list = {};
            scrub_last_pg = {};
            // The position is only meaningful together with the list
            scrub_list_pos = 0;
        }
        return 0;
    }
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
    // Restart scanning from the same PG as the last time
    auto pg_it = pgs.lower_bound(scrub_last_pg);
    if (pg_it == pgs.end() && rescan)
    {
        pg_it = pgs.begin();
        rescan = false;
    }
    while (pg_it != pgs.end())
    {
        if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub < tv_now.tv_sec)
        {
            // Continue scrubbing from the next object
            if (scrub_last_pg == pg_it->first)
            {
                while (scrub_list_pos < scrub_cur_list.total_count)
                {
                    auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
                    oid.stripe &= ~STRIPE_MASK;
                    scrub_list_pos++;
                    // Skip objects which are currently being recovered, scrubbed or written
                    if (recovery_ops.find(oid) == recovery_ops.end() &&
                        scrub_ops.find(oid) == scrub_ops.end() &&
                        pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
                    {
                        next_oid = oid;
                        if (!(pg_it->second.state & PG_SCRUBBING))
                        {
                            // Currently scrubbing this PG
                            pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                            report_pg_state(pg_it->second);
                        }
                        return 2;
                    }
                }
            }
            if (scrub_last_pg == pg_it->first &&
                scrub_list_pos >= scrub_cur_list.total_count &&
                scrub_cur_list.stable_count < scrub_list_limit)
            {
                // End of the list, mark this PG as scrubbed and go to the next PG
            }
            else
            {
                // Continue listing
                object_id scrub_last_oid = {};
                if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
                {
                    scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
                    scrub_last_oid.stripe++;
                }
                // List from this OSD if it is in the current set, otherwise from another OSD of the set
                osd_num_t scrub_osd = 0;
                for (osd_num_t pg_osd: pg_it->second.cur_set)
                {
                    if (pg_osd == this->osd_num || scrub_osd == 0)
                        scrub_osd = pg_osd;
                }
                if (!(pg_it->second.state & PG_SCRUBBING))
                {
                    // Currently scrubbing this PG
                    pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                    report_pg_state(pg_it->second);
                }
                if (scrub_cur_list.buf)
                {
                    free(scrub_cur_list.buf);
                    scrub_cur_list = {};
                }
                // A new list always has to be processed from its beginning. The position
                // must be reset even if the previous buffer was already freed, otherwise
                // the offset of the previous PG would be applied to the new list
                scrub_list_pos = 0;
                scrub_last_pg = pg_it->first;
                scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
                return 1;
            }
            if (pg_it->second.state & PG_SCRUBBING)
            {
                scrub_last_pg = {};
                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
                pg_it->second.next_scrub = 0;
                pg_it->second.history_changed = true;
                report_pg_state(pg_it->second);
            }
            // The list is definitely not needed anymore
            if (scrub_cur_list.buf)
            {
                free(scrub_cur_list.buf);
                scrub_cur_list = {};
            }
            scrub_list_pos = 0;
        }
        pg_it++;
        if (pg_it == pgs.end() && rescan)
        {
            // Scan one more time to guarantee that there are no PGs to scrub
            pg_it = pgs.begin();
            rescan = false;
        }
    }
    // Scanned all PGs - no more scrubs to do
    return 0;
}
|
||||
|
||||
// Create and execute a scrub operation for object <oid>.
// The operation is registered in scrub_ops until it completes; after completion
// (and after scrub_sleep_ms, if it is set) the scrub process is continued.
void osd_t::submit_scrub_op(object_id oid)
{
    auto scrub_op = new osd_op_t();
    scrub_op->op_type = OSD_OP_OUT;
    scrub_op->peer_fd = -1;
    scrub_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = 1,
                .opcode = OSD_OP_SCRUB,
            },
            .inode = oid.inode,
            .offset = oid.stripe,
            .len = 0,
        },
    };
    if (log_level > 2)
    {
        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
    }
    scrub_op->callback = [this](osd_op_t *done_op)
    {
        object_id oid = { .inode = done_op->req.rw.inode, .stripe = done_op->req.rw.offset };
        const bool failed = done_op->reply.hdr.retval < 0 && done_op->reply.hdr.retval != -ENOENT;
        if (failed)
        {
            // Scrub error
            printf(
                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
                oid.inode, oid.stripe, INODE_POOL(oid.inode),
                map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
                done_op->reply.hdr.retval
            );
        }
        else if (log_level > 2)
        {
            printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe);
        }
        delete done_op;
        // Unregisters the object and picks the next one
        auto proceed = [this, oid]()
        {
            scrub_ops.erase(oid);
            continue_scrub();
        };
        if (!scrub_sleep_ms)
        {
            proceed();
        }
        else
        {
            // Throttle: wait before going to the next object
            this->tfd->set_timer(scrub_sleep_ms, false, [proceed](int timer_id)
            {
                proceed();
            });
        }
    };
    scrub_ops[oid] = scrub_op;
    exec_op(scrub_op);
}
|
||||
|
||||
// Triggers scrub requests
|
||||
// Scrub reads data from all replicas and compares it
|
||||
// To scrub first we need to read objects listings
|
||||
bool osd_t::continue_scrub()
|
||||
{
|
||||
if (scrub_list_op)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (no_scrub)
|
||||
{
|
||||
// Return false = no more scrub work to do
|
||||
scrub_cur_list = {};
|
||||
scrub_last_pg = {};
|
||||
scrub_nearest_ts = 0;
|
||||
if (scrub_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(scrub_timer_id);
|
||||
scrub_timer_id = -1;
|
||||
}
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
if (pg_it->second.state & PG_SCRUBBING)
|
||||
{
|
||||
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
|
||||
report_pg_state(pg_it->second);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
while (scrub_ops.size() < scrub_queue_depth)
|
||||
{
|
||||
object_id oid;
|
||||
int r = pick_next_scrub(oid);
|
||||
if (r == 2)
|
||||
submit_scrub_op(oid);
|
||||
else
|
||||
return r;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Set the time of the next scrub for an active PG which doesn't have it yet
// (only when auto_scrub is enabled) and schedule the scrub timer.
// The interval is taken from the pool configuration or from the global setting.
void osd_t::plan_scrub(pg_t & pg, bool report_state)
{
    const bool needs_plan = (pg.state & PG_ACTIVE) && !pg.next_scrub && auto_scrub;
    if (!needs_plan)
    {
        return;
    }
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
    auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
    if (pg.next_scrub != tv_now.tv_sec + interval)
    {
        // Remember the new time both in the PG and in its configuration
        pg.next_scrub = tv_now.tv_sec + interval;
        pool_cfg.pg_config[pg.pg_num].next_scrub = pg.next_scrub;
        pg.history_changed = true;
        if (report_state)
        {
            report_pg_state(pg);
        }
    }
    schedule_scrub(pg);
}
|
||||
|
||||
// Re-arm the scrub timer if the scrub of <pg> is planned earlier than the nearest
// known scrub time. If that time has already passed, scrubbing is triggered immediately.
void osd_t::schedule_scrub(pg_t & pg)
{
    if (no_scrub || !pg.next_scrub)
    {
        return;
    }
    if (scrub_nearest_ts && scrub_nearest_ts <= pg.next_scrub)
    {
        // An earlier (or the same) wakeup is already scheduled
        return;
    }
    scrub_nearest_ts = pg.next_scrub;
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    // Drop the previous timer, if any
    if (scrub_timer_id >= 0)
    {
        tfd->clear_timer(scrub_timer_id);
        scrub_timer_id = -1;
    }
    if (tv_now.tv_sec > scrub_nearest_ts)
    {
        // Already overdue: start scrubbing on the next event loop iteration
        scrub_nearest_ts = 0;
        peering_state = peering_state | OSD_SCRUBBING;
        ringloop->wakeup();
        return;
    }
    auto delay_ms = (scrub_nearest_ts-tv_now.tv_sec)*1000;
    scrub_timer_id = tfd->set_timer(delay_ms, false, [this](int timer_id)
    {
        scrub_timer_id = -1;
        scrub_nearest_ts = 0;
        peering_state = peering_state | OSD_SCRUBBING;
        ringloop->wakeup();
    });
}
|
||||
|
||||
// Scrub a single object: read all of its available chunks and compare them.
//
// This is a resumable state machine driven by op_data->st:
//   st == 0 - initial call: submit reads of all chunks, set st = 1
//   st == 1 - re-entered while st is still 1: nothing to do, return
//   st == 2 - process the results: compare the chunks and mark bad ones as corrupted
// NOTE(review): st is presumably switched to 2 by the subop completion code outside
// of this function - confirm
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
        return;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    switch (op_data->st)
    {
    case 1:
        goto resume_1;
    case 2:
        goto resume_2;
    default:
        break;
    }
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        if (vo_it != pg.ver_override.end())
            op_data->target_ver = vo_it->second;
        else
            op_data->target_ver = UINT64_MAX;
        // PG may have degraded or misplaced objects
        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        // Read all available chunks
        int n_copies = 0;
        op_data->degraded = false;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            auto & chunk = op_data->stripes[role];
            chunk.write_buf = NULL;
            chunk.read_start = 0;
            chunk.read_end = bs_block_size;
            if (op_data->prev_set[role] != 0)
            {
                n_copies++;
                continue;
            }
            // No OSD for this role
            chunk.missing = true;
            if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
            {
                op_data->degraded = true;
            }
        }
        if (n_copies <= op_data->pg_data_size)
        {
            // Nothing to compare, even if we'd like to
            finish_op(cur_op, 0);
            return;
        }
        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
        // Submit reads
        osd_op_t *subops = new osd_op_t[n_copies];
        op_data->fact_ver = 0;
        op_data->errcode = 0;
        op_data->errors = 0;
        op_data->done = 0;
        op_data->n_subops = n_copies;
        op_data->subops = subops;
        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
        assert(sent == n_copies);
        op_data->st = 1;
    }
resume_1:
    return;
resume_2:
    if (op_data->errors > 0)
    {
        if (op_data->errcode != -EIO && op_data->errcode != -EDOM)
        {
            // Not an I/O or checksum error - just fail the operation
            finish_op(cur_op, op_data->errcode);
            return;
        }
        // I/O or checksum error
        int n_copies = 0;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            auto & chunk = op_data->stripes[role];
            if (chunk.read_error)
            {
                chunk.missing = true;
                if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
                {
                    op_data->degraded = true;
                }
            }
            else if (!chunk.missing)
            {
                n_copies++;
            }
        }
        if (n_copies <= op_data->pg_data_size)
        {
            // Nothing to compare, just mark the object as corrupted
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
            // Operation is treated as unsuccessful only if the object becomes unreadable
            finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
            return;
        }
        // Proceed, we can still compare chunks that were successfully read
    }
    bool inconsistent = false;
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Check that all chunks have returned the same data
        int total = 0;
        // eq_to[role] is the first role holding identical data, or -1 for unused copies;
        // votes[role] is the number of copies identical to the unique copy <role>
        int eq_to[op_data->pg_size];
        int votes[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
        {
            eq_to[role] = -1;
            votes[role] = 0;
        }
        for (int role = 0; role < op_data->pg_size; role++)
        {
            auto & chunk = op_data->stripes[role];
            if (chunk.read_end == 0 || chunk.missing || chunk.not_exists)
            {
                continue;
            }
            total++;
            int same_as = role;
            for (int other = 0; other < role; other++)
            {
                // Only compare with unique chunks (eq_to[other] == other)
                if (eq_to[other] == other && memcmp(chunk.read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
                {
                    same_as = other;
                    break;
                }
            }
            eq_to[role] = same_as;
            votes[same_as]++;
        }
        // The first copy with the maximum number of votes wins
        int best = -1, best_votes = 0;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (votes[role] > best_votes)
            {
                best = role;
                best_votes = votes[role];
            }
        }
        if (best >= 0 && best_votes < total)
        {
            // Without scrub_find_best the result is always treated as ambiguous
            bool unknown = !scrub_find_best;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (role != best && votes[role] == best_votes)
                {
                    unknown = true;
                }
                if (votes[role] > 0 && votes[role] < best_votes)
                {
                    printf(
                        "[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies%s\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
                        op_data->stripes[role].osd_num, best_votes,
                        scrub_find_best ? ", marking it as corrupted" : ""
                    );
                    if (scrub_find_best)
                    {
                        op_data->stripes[role].read_error = true;
                    }
                }
            }
            if (unknown)
            {
                // It's unknown which replica is good. There are multiple versions with no majority
                // Mark all good replicas as ambiguous
                best = -1;
                inconsistent = true;
                printf(
                    "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
                );
            }
        }
    }
    else
    {
        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
        auto good_subset = ec_find_good(
            op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
            bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
        );
        if (good_subset.size())
        {
            // First mark every chunk that was read as bad, then clear the mark from good ones
            int total = 0;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (!op_data->stripes[role].missing)
                {
                    total++;
                    op_data->stripes[role].read_error = true;
                }
            }
            for (int role: good_subset)
            {
                op_data->stripes[role].read_error = false;
            }
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
                {
                    printf(
                        "[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match other chunks%s\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
                        role, op_data->stripes[role].osd_num,
                        scrub_find_best ? ", marking it as corrupted" : ""
                    );
                }
            }
            if (!scrub_find_best && good_subset.size() < total)
            {
                inconsistent = true;
                printf(
                    "[PG %u/%u] Object %lx:%lx v%lu is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
                );
                for (int role = 0; role < op_data->pg_size; role++)
                {
                    if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
                    {
                        // Undo error locator marking chunk as bad
                        op_data->stripes[role].read_error = false;
                    }
                }
            }
        }
        else
        {
            inconsistent = true;
            printf(
                "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
                INODE_POOL(op_data->oid.inode), op_data->pg_num,
                op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
            );
        }
    }
    for (int role = 0; role < op_data->pg_size; role++)
    {
        const bool bad_chunk = op_data->stripes[role].osd_num != 0 &&
            (op_data->stripes[role].read_error || op_data->stripes[role].not_exists);
        if (bad_chunk || inconsistent)
        {
            // Got at least 1 read error or mismatch, mark the object as corrupted
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
            break;
        }
    }
    finish_op(cur_op, 0);
}
|
@@ -125,18 +125,11 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
||||
secondary_op_callback(cur_op);
|
||||
return;
|
||||
}
|
||||
cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
|
||||
cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
|
||||
cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
|
||||
cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
|
||||
cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
|
||||
cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
|
||||
if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
|
||||
{
|
||||
cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
|
||||
? cur_op->req.sec_list.max_stripe : UINT64_MAX;
|
||||
}
|
||||
cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
|
||||
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
|
||||
cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
|
||||
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
|
||||
cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
|
||||
cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
|
||||
#ifdef OSD_STUB
|
||||
cur_op->bs_op->retval = 0;
|
||||
cur_op->bs_op->buf = NULL;
|
||||
|
@@ -3,9 +3,9 @@
|
||||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 17;
|
||||
const int pg_state_bit_count = 14;
|
||||
|
||||
const int pg_state_bits[17] = {
|
||||
const int pg_state_bits[14] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
@@ -14,18 +14,15 @@ const int pg_state_bits[17] = {
|
||||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
PG_HAS_INCONSISTENT,
|
||||
PG_HAS_CORRUPTED,
|
||||
PG_HAS_INCOMPLETE,
|
||||
PG_HAS_DEGRADED,
|
||||
PG_HAS_MISPLACED,
|
||||
PG_HAS_UNCLEAN,
|
||||
PG_HAS_INVALID,
|
||||
PG_LEFT_ON_DEAD,
|
||||
PG_SCRUBBING,
|
||||
};
|
||||
|
||||
const char *pg_state_names[17] = {
|
||||
const char *pg_state_names[14] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
@@ -34,37 +31,10 @@ const char *pg_state_names[17] = {
|
||||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
"has_inconsistent",
|
||||
"has_corrupted",
|
||||
"has_incomplete",
|
||||
"has_degraded",
|
||||
"has_misplaced",
|
||||
"has_unclean",
|
||||
"has_invalid",
|
||||
"left_on_dead",
|
||||
"scrubbing",
|
||||
};
|
||||
|
||||
const int object_state_bit_count = 8;
|
||||
|
||||
const int object_state_bits[8] = {
|
||||
OBJ_DEGRADED,
|
||||
OBJ_INCOMPLETE,
|
||||
OBJ_MISPLACED,
|
||||
OBJ_CORRUPTED,
|
||||
OBJ_INCONSISTENT,
|
||||
OBJ_NEEDS_STABLE,
|
||||
OBJ_NEEDS_ROLLBACK,
|
||||
0,
|
||||
};
|
||||
|
||||
const char *object_state_names[8] = {
|
||||
"degraded",
|
||||
"incomplete",
|
||||
"misplaced",
|
||||
"corrupted",
|
||||
"inconsistent",
|
||||
"needs_stable",
|
||||
"needs_rollback",
|
||||
"clean",
|
||||
};
|
||||
|
@@ -22,10 +22,7 @@
|
||||
#define PG_HAS_MISPLACED (1<<10)
|
||||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_HAS_CORRUPTED (1<<13)
|
||||
#define PG_HAS_INCONSISTENT (1<<14)
|
||||
#define PG_LEFT_ON_DEAD (1<<15)
|
||||
#define PG_SCRUBBING (1<<16)
|
||||
#define PG_LEFT_ON_DEAD (1<<13)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
@@ -35,18 +32,9 @@
|
||||
#define OBJ_DEGRADED 0x02
|
||||
#define OBJ_INCOMPLETE 0x04
|
||||
#define OBJ_MISPLACED 0x08
|
||||
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
|
||||
#define OBJ_CORRUPTED 0x10
|
||||
// OBJ_INCONSISTENT is when its replicas don't match, but it's unclear which one is correct
|
||||
// OBJ_INCONSISTENT may be set with CORRUPTED, but never with other states
|
||||
#define OBJ_INCONSISTENT 0x20
|
||||
#define OBJ_NEEDS_STABLE 0x10000
|
||||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||
|
||||
extern const int pg_state_bits[];
|
||||
extern const char *pg_state_names[];
|
||||
extern const int pg_state_bit_count;
|
||||
|
||||
extern const int object_state_bits[];
|
||||
extern const char *object_state_names[];
|
||||
extern const int object_state_bit_count;
|
||||
|
@@ -3,7 +3,6 @@
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include "str_util.h"
|
||||
|
||||
std::string base64_encode(const std::string &in)
|
||||
@@ -250,53 +249,3 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
|
||||
fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
uint64_t parse_time(std::string time_str, bool *ok)
|
||||
{
|
||||
if (!time_str.length())
|
||||
{
|
||||
if (ok)
|
||||
*ok = false;
|
||||
return 0;
|
||||
}
|
||||
uint64_t mul = 1;
|
||||
char type_char = tolower(time_str[time_str.length()-1]);
|
||||
if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
|
||||
{
|
||||
if (type_char == 's')
|
||||
mul = 1;
|
||||
else if (time_str[time_str.length()-1] == 'M')
|
||||
mul = 30*86400;
|
||||
else if (type_char == 'm')
|
||||
mul = 60;
|
||||
else if (type_char == 'h')
|
||||
mul = 3600;
|
||||
else if (type_char == 'd')
|
||||
mul = 86400;
|
||||
else /*if (type_char == 'y')*/
|
||||
mul = 86400*365;
|
||||
time_str = time_str.substr(0, time_str.length()-1);
|
||||
}
|
||||
uint64_t ts = stoull_full(time_str, 0) * mul;
|
||||
if (ok)
|
||||
*ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
|
||||
return ts;
|
||||
}
|
||||
|
||||
// Read from <fd> until end-of-file or a non-retryable error and return
// everything that was read.
//
// EAGAIN and EINTR are retried immediately, so a non-blocking descriptor with
// no data available is polled in a tight loop. Other read errors are not
// reported: the data collected so far is returned.
std::string read_all_fd(int fd)
{
    // size_t instead of int: the doubling below must not overflow on large inputs
    size_t res_size = 0;
    std::string res;
    while (1)
    {
        // Grow the buffer geometrically, starting at 1 KB
        if (res_size >= res.size())
            res.resize(res.size() ? res.size()*2 : 1024);
        ssize_t r = read(fd, &res[res_size], res.size()-res_size);
        if (r > 0)
            res_size += (size_t)r;
        else if (r == 0 || (errno != EAGAIN && errno != EINTR))
            break;
    }
    // Drop the unused tail of the buffer
    res.resize(res_size);
    return res;
}
|
||||
|
@@ -15,5 +15,3 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
|
||||
uint64_t stoull_full(const std::string & str, int base = 0);
|
||||
std::string format_size(uint64_t size, bool nobytes = false);
|
||||
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
||||
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||
std::string read_all_fd(int fd);
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.9.1
|
||||
Version: 0.8.9
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -7,7 +7,7 @@ fi
|
||||
|
||||
format_error()
|
||||
{
|
||||
echo $(echo -n -e "\033[1;31m")"$0 $1"$(echo -n -e "\033[m")
|
||||
echo $(echo -n -e "\033[1;31m")"$1"$(echo -n -e "\033[m")
|
||||
$ETCDCTL get --prefix /vitastor > ./testdata/etcd-dump.txt
|
||||
exit 1
|
||||
}
|
||||
|
@@ -95,29 +95,19 @@ try_reweight()
|
||||
sleep 3
|
||||
}
|
||||
|
||||
# wait_condition <seconds> <check> <name>
#
# Evaluates the shell snippet <check> once per second until it succeeds.
# If it still fails after <seconds> attempts, calls format_error with
# "<name> couldn't finish in <seconds> seconds".
wait_condition()
{
    sec=$1
    check=$2
    proc=$3
    i=0
    while [[ $i -lt $sec ]]; do
        eval "$check" && break
        sleep 1
        i=$((i+1))
        # The limit has to be tested after the increment: before it, $i is
        # always below $sec inside this loop and the timeout would never fire
        if [ $i -eq $sec ]; then
            format_error "$proc couldn't finish in $sec seconds"
        fi
    done
}
|
||||
|
||||
wait_finish_rebalance()
|
||||
{
|
||||
sec=$1
|
||||
check=$2
|
||||
check=${check:-'.state == ["active"] or .state == ["active", "left_on_dead"]'}
|
||||
check="$ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select($check) ] | length) == $PG_COUNT'"
|
||||
wait_condition "$sec" "$check" Rebalance
|
||||
i=0
|
||||
while [[ $i -lt $sec ]]; do
|
||||
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT) && \
|
||||
break
|
||||
sleep 1
|
||||
i=$((i+1))
|
||||
if [ $i -eq $sec ]; then
|
||||
format_error "Rebalance couldn't finish in $sec seconds"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
check_qemu()
|
||||
|
@@ -46,10 +46,3 @@ SCHEME=xor ./test_write.sh
|
||||
|
||||
PG_SIZE=2 ./test_heal.sh
|
||||
SCHEME=ec ./test_heal.sh
|
||||
|
||||
./test_scrub.sh
|
||||
ZERO_OSD=2 ./test_scrub.sh
|
||||
SCHEME=xor ./test_scrub.sh
|
||||
PG_SIZE=3 ./test_scrub.sh
|
||||
PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh
|
||||
SCHEME=ec ./test_scrub.sh
|
||||
|
@@ -46,8 +46,8 @@ kill_osds()
|
||||
kill_osds &
|
||||
|
||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \
|
||||
-randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
|
||||
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
|
||||
|
@@ -1,65 +0,0 @@
|
||||
#!/bin/bash -ex
# Test for scrub without checksums
#
# Writes an image, wipes the data area of one OSD (ZERO_OSD, default 1),
# restarts it, triggers a scrub and then checks that the objects are either
# reported as inconsistent and fixable, or healed, before reading the image
# back and comparing it with the mirror file.

ZERO_OSD=${ZERO_OSD:-1}

# Two OSDs are used when neither SCHEME nor PG_SIZE ask for more
if [[ ("$SCHEME" = "" || "$SCHEME" = "replicated") && ("$PG_SIZE" = "" || "$PG_SIZE" = 2) ]]; then
    OSD_COUNT=2
fi

. `dirname $0`/run_3osds.sh

check_qemu

IMG_SIZE=128

$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'

# Write
LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
        -mirror_file=./testdata/mirror.bin -end_fsync=1 -rw=write -etcd=$ETCD_URL -image=testimg

# Intentionally corrupt OSD data and restart it
zero_osd_pid=OSD${ZERO_OSD}_PID
kill ${!zero_osd_pid}
sleep 1
kill -9 ${!zero_osd_pid} || true
data_offset=$(build/src/vitastor-disk simple-offsets ./testdata/test_osd$ZERO_OSD.bin $OFFSET_ARGS | grep data_offset | awk '{print $2}')
# Cut the file at the reported data offset, then grow it again by writing a
# single zero block at block index OSD_SIZE*1024-1 (bs=1024)
truncate -s $data_offset ./testdata/test_osd$ZERO_OSD.bin
dd if=/dev/zero of=./testdata/test_osd$ZERO_OSD.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
$ETCDCTL del /vitastor/osd/state/$ZERO_OSD
start_osd $ZERO_OSD

# Wait until start
wait_up 10

# Trigger scrub
$ETCDCTL put /vitastor/pg/history/1/1 `$ETCDCTL get --print-value-only /vitastor/pg/history/1/1 | jq -s -c '(.[0] // {}) + {"next_scrub":1}'`

# Wait for scrub to finish
wait_condition 60 "$ETCDCTL get --prefix /vitastor/pg/history/ --print-value-only | jq -s -e '([ .[] | select(.next_scrub == 0 or .next_scrub == null) ] | length) == $PG_COUNT'" Scrubbing

# -lt / -gt compare numerically; < and > inside [[ ]] would compare strings
if [[ ($SCHEME = replicated && $PG_SIZE -lt 3) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) -lt 2) ]]; then
    # Check that objects are marked as inconsistent if 2 replicas or EC/XOR 2+1
    # The divisor is chosen with a string test: inside $(( )) the expression
    # "SCHEME = replicated" would be an assignment, not a comparison
    if [[ $SCHEME = replicated ]]; then
        data_chunks=1
    else
        data_chunks=$PG_DATA_SIZE
    fi
    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | jq -e '[ .[] | select(.inconsistent) ] | length == '$((IMG_SIZE * 8 * PG_SIZE / data_chunks))

    # Fix objects using vitastor-cli fix
    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | \
        jq -s '[ .[0][] | select(.inconsistent and .osd_num == '$ZERO_OSD') ]' | \
        build/src/vitastor-cli fix --etcd_address $ETCD_URL --bad_osds $ZERO_OSD
elif [[ ($SCHEME = replicated && $PG_SIZE -gt 2) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) -gt 1) ]]; then
    # Check that everything heals
    wait_finish_rebalance 60

    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | jq -e '. | length == 0'
fi

# Read everything back
qemu-img convert -S 4096 -p \
    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
    -O raw ./testdata/read.bin

diff ./testdata/read.bin ./testdata/mirror.bin

format_green OK
|
Reference in New Issue
Block a user