Compare commits
72 Commits
4efb8bf38a
...
1b1e199496
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 1b1e199496 | |
Vitaliy Filippov | 5d8063a820 | |
Vitaliy Filippov | 6d7b6cc02d | |
Vitaliy Filippov | 6d71437891 | |
Vitaliy Filippov | a5db4be1b3 | |
Vitaliy Filippov | adbe9eee50 | |
Vitaliy Filippov | ac9f0ad35b | |
Vitaliy Filippov | 0acb7b57f9 | |
Vitaliy Filippov | 34b880fce0 | |
Vitaliy Filippov | ebefff393d | |
Vitaliy Filippov | 7e2eeb0994 | |
Vitaliy Filippov | f615274f44 | |
Vitaliy Filippov | a62dc47e3a | |
Vitaliy Filippov | 3d7b773218 | |
Vitaliy Filippov | 65bb046eef | |
Vitaliy Filippov | 5b87859160 | |
Vitaliy Filippov | e4f9d418ed | |
Vitaliy Filippov | d10f8eced5 | |
Vitaliy Filippov | 0ff17bd791 | |
Vitaliy Filippov | 0e2b690fe8 | |
Vitaliy Filippov | 509a0c9c32 | |
Vitaliy Filippov | 1e927b0ff1 | |
Vitaliy Filippov | e276d47d81 | |
Vitaliy Filippov | ebc4a46e95 | |
Vitaliy Filippov | 23f1851b7f | |
Vitaliy Filippov | cccfe5a985 | |
Vitaliy Filippov | 8f7164c8fd | |
Vitaliy Filippov | 1581853918 | |
Vitaliy Filippov | 1218f5718d | |
Vitaliy Filippov | 01fd5b6c34 | |
Vitaliy Filippov | 245855b368 | |
Vitaliy Filippov | f77c955134 | |
Vitaliy Filippov | 1ed5cc72e5 | |
Vitaliy Filippov | e8b93e7db9 | |
Vitaliy Filippov | a77bba3018 | |
Vitaliy Filippov | 5af63e574a | |
Vitaliy Filippov | e3e2a62e81 | |
Vitaliy Filippov | 170b381271 | |
Vitaliy Filippov | 1bc0b5aab3 | |
Vitaliy Filippov | 5e934264cf | |
Vitaliy Filippov | f20564b44b | |
Vitaliy Filippov | b3c15db331 | |
Vitaliy Filippov | 685bcd6ef9 | |
Vitaliy Filippov | 3eb389b321 | |
Vitaliy Filippov | 3d16cde23c | |
Vitaliy Filippov | c6406d67fc | |
Vitaliy Filippov | f87964861d | |
Vitaliy Filippov | 62a4f45160 | |
Vitaliy Filippov | 7048228678 | |
Vitaliy Filippov | ea73857450 | |
Vitaliy Filippov | 6cfe38ec04 | |
Vitaliy Filippov | 7ae5766fdb | |
Vitaliy Filippov | f882c7dd87 | |
Vitaliy Filippov | 26dd863c8d | |
Vitaliy Filippov | 2ae859fbc6 | |
Vitaliy Filippov | f6cd9f9153 | |
Vitaliy Filippov | 8389c0f33b | |
Vitaliy Filippov | 9db2196aef | |
Vitaliy Filippov | 8d6ae662fe | |
Vitaliy Filippov | c777a0041a | |
Vitaliy Filippov | 2947ea93e8 | |
Vitaliy Filippov | 978bdc128a | |
Vitaliy Filippov | bb2f395f1e | |
Vitaliy Filippov | b127da40f7 | |
Vitaliy Filippov | ca34a6047a | |
Vitaliy Filippov | 38ba76e893 | |
Vitaliy Filippov | 1e3c4edea0 | |
Vitaliy Filippov | e7ac855b07 | |
Vitaliy Filippov | c53357ac45 | |
Vitaliy Filippov | 27e9f244ec | |
Vitaliy Filippov | 8e25a28a08 | |
Vitaliy Filippov | 5d3317e4f2 |
|
@ -395,7 +395,7 @@ jobs:
|
||||||
steps:
|
steps:
|
||||||
- name: Run test
|
- name: Run test
|
||||||
id: test
|
id: test
|
||||||
timeout-minutes: 3
|
timeout-minutes: 6
|
||||||
run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
|
run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
|
||||||
- name: Print logs
|
- name: Print logs
|
||||||
if: always() && steps.test.outcome == 'failure'
|
if: always() && steps.test.outcome == 'failure'
|
||||||
|
|
|
@ -39,6 +39,10 @@ for my $line (<>)
|
||||||
$test_name .= '_'.lc($1).'_'.$2;
|
$test_name .= '_'.lc($1).'_'.$2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if ($test_name eq 'test_snapshot_chain_ec')
|
||||||
|
{
|
||||||
|
$timeout = 6;
|
||||||
|
}
|
||||||
$line =~ s!\./test_!/root/vitastor/tests/test_!;
|
$line =~ s!\./test_!/root/vitastor/tests/test_!;
|
||||||
# Gitea CI doesn't support artifacts yet, lol
|
# Gitea CI doesn't support artifacts yet, lol
|
||||||
#- name: Upload results
|
#- name: Upload results
|
||||||
|
|
|
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VERSION "1.4.2")
|
set(VERSION "1.4.7")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
VERSION ?= v1.4.2
|
VERSION ?= v1.4.7
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ spec:
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v1.4.2
|
image: vitalif/vitastor-csi:v1.4.7
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -121,7 +121,7 @@ spec:
|
||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v1.4.2
|
image: vitalif/vitastor-csi:v1.4.7
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -5,7 +5,7 @@ package vitastor
|
||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "1.4.2"
|
vitastorCSIDriverVersion = "1.4.7"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
vitastor (1.4.2-1) unstable; urgency=medium
|
vitastor (1.4.7-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
|
|
|
@ -35,8 +35,8 @@ RUN set -e -x; \
|
||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-1.4.2; \
|
cp -r /root/vitastor vitastor-1.4.7; \
|
||||||
cd vitastor-1.4.2; \
|
cd vitastor-1.4.7; \
|
||||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||||
|
@ -49,8 +49,8 @@ RUN set -e -x; \
|
||||||
rm -rf a b; \
|
rm -rf a b; \
|
||||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.2.orig.tar.xz vitastor-1.4.2; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.7.orig.tar.xz vitastor-1.4.7; \
|
||||||
cd vitastor-1.4.2; \
|
cd vitastor-1.4.7; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
|
|
@ -19,8 +19,8 @@ These parameters only apply to Monitors.
|
||||||
## etcd_mon_ttl
|
## etcd_mon_ttl
|
||||||
|
|
||||||
- Type: seconds
|
- Type: seconds
|
||||||
- Default: 30
|
- Default: 1
|
||||||
- Minimum: 10
|
- Minimum: 5
|
||||||
|
|
||||||
Monitor etcd lease refresh interval in seconds
|
Monitor etcd lease refresh interval in seconds
|
||||||
|
|
||||||
|
|
|
@ -19,8 +19,8 @@
|
||||||
## etcd_mon_ttl
|
## etcd_mon_ttl
|
||||||
|
|
||||||
- Тип: секунды
|
- Тип: секунды
|
||||||
- Значение по умолчанию: 30
|
- Значение по умолчанию: 1
|
||||||
- Минимальное значение: 10
|
- Минимальное значение: 5
|
||||||
|
|
||||||
Интервал обновления etcd резервации (lease) монитором
|
Интервал обновления etcd резервации (lease) монитором
|
||||||
|
|
||||||
|
|
|
@ -215,8 +215,8 @@ is scheduled.
|
||||||
## up_wait_retry_interval
|
## up_wait_retry_interval
|
||||||
|
|
||||||
- Type: milliseconds
|
- Type: milliseconds
|
||||||
- Default: 500
|
- Default: 50
|
||||||
- Minimum: 50
|
- Minimum: 10
|
||||||
- Can be changed online: yes
|
- Can be changed online: yes
|
||||||
|
|
||||||
OSDs respond to clients with a special error code when they receive I/O
|
OSDs respond to clients with a special error code when they receive I/O
|
||||||
|
|
|
@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн
|
||||||
## up_wait_retry_interval
|
## up_wait_retry_interval
|
||||||
|
|
||||||
- Тип: миллисекунды
|
- Тип: миллисекунды
|
||||||
- Значение по умолчанию: 500
|
- Значение по умолчанию: 50
|
||||||
- Минимальное значение: 50
|
- Минимальное значение: 10
|
||||||
- Можно менять на лету: да
|
- Можно менять на лету: да
|
||||||
|
|
||||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||||
|
|
|
@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd.
|
||||||
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
||||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||||
|
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||||
|
|
||||||
## etcd_report_interval
|
## etcd_report_interval
|
||||||
|
|
||||||
|
@ -604,5 +605,14 @@ is usually fine.
|
||||||
- Default: 10
|
- Default: 10
|
||||||
- Can be changed online: yes
|
- Can be changed online: yes
|
||||||
|
|
||||||
Minimum possible value for auto-tuned recovery_sleep_us. Values lower
|
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
|
||||||
than this value are changed to 0.
|
are changed to 0.
|
||||||
|
|
||||||
|
## recovery_tune_sleep_cutoff_us
|
||||||
|
|
||||||
|
- Type: microseconds
|
||||||
|
- Default: 10000000
|
||||||
|
- Can be changed online: yes
|
||||||
|
|
||||||
|
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
||||||
|
are treated as outliers and ignored in aggregation.
|
||||||
|
|
|
@ -60,6 +60,7 @@
|
||||||
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
||||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||||
|
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||||
|
|
||||||
## etcd_report_interval
|
## etcd_report_interval
|
||||||
|
|
||||||
|
@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
|
||||||
- Можно менять на лету: да
|
- Можно менять на лету: да
|
||||||
|
|
||||||
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
Значения ниже данного заменяются на 0.
|
Меньшие значения заменяются на 0.
|
||||||
|
|
||||||
|
## recovery_tune_sleep_cutoff_us
|
||||||
|
|
||||||
|
- Тип: микросекунды
|
||||||
|
- Значение по умолчанию: 10000000
|
||||||
|
- Можно менять на лету: да
|
||||||
|
|
||||||
|
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
|
Большие значения считаются случайными выбросами и игнорируются в
|
||||||
|
усреднении.
|
||||||
|
|
|
@ -154,6 +154,9 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
|
||||||
OSDs, PG is deactivated for both read and write. So you know that a fresh
|
OSDs, PG is deactivated for both read and write. So you know that a fresh
|
||||||
write always goes to at least (pg_minsize) OSDs (disks).
|
write always goes to at least (pg_minsize) OSDs (disks).
|
||||||
|
|
||||||
|
That is, pg_size minus pg_minsize sets the number of disk failures to tolerate
|
||||||
|
without temporary downtime (for [osd_out_time](monitor.en.md#osd_out_time)).
|
||||||
|
|
||||||
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
|
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
|
||||||
read-only instead of deactivating them.
|
read-only instead of deactivating them.
|
||||||
|
|
||||||
|
|
|
@ -157,6 +157,10 @@
|
||||||
OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
|
OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
|
||||||
что новые блоки данных всегда записываются как минимум на pg_minsize дисков.
|
что новые блоки данных всегда записываются как минимум на pg_minsize дисков.
|
||||||
|
|
||||||
|
По сути, разница pg_size и pg_minsize задаёт число отказов дисков, которые пул
|
||||||
|
может пережить без временной (на [osd_out_time](monitor.ru.md#osd_out_time))
|
||||||
|
остановки обслуживания.
|
||||||
|
|
||||||
FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
|
FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
|
||||||
PG на перевод их в режим только для чтения.
|
PG на перевод их в режим только для чтения.
|
||||||
|
|
||||||
|
|
|
@ -731,8 +731,19 @@
|
||||||
default: 10
|
default: 10
|
||||||
online: true
|
online: true
|
||||||
info: |
|
info: |
|
||||||
Minimum possible value for auto-tuned recovery_sleep_us. Values lower
|
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
|
||||||
than this value are changed to 0.
|
are changed to 0.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
Значения ниже данного заменяются на 0.
|
Меньшие значения заменяются на 0.
|
||||||
|
- name: recovery_tune_sleep_cutoff_us
|
||||||
|
type: us
|
||||||
|
default: 10000000
|
||||||
|
online: true
|
||||||
|
info: |
|
||||||
|
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
||||||
|
are treated as outliers and ignored in aggregation.
|
||||||
|
info_ru: |
|
||||||
|
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||||
|
Большие значения считаются случайными выбросами и игнорируются в
|
||||||
|
усреднении.
|
||||||
|
|
|
@ -261,7 +261,7 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
|
||||||
```
|
```
|
||||||
--object_size 128k Set blockstore block size
|
--object_size 128k Set blockstore block size
|
||||||
--bitmap_granularity 4k Set bitmap granularity
|
--bitmap_granularity 4k Set bitmap granularity
|
||||||
--journal_size 16M Set journal size
|
--journal_size 32M Set journal size
|
||||||
--data_csum_type none Set data checksum type (crc32c or none)
|
--data_csum_type none Set data checksum type (crc32c or none)
|
||||||
--csum_block_size 4k Set data checksum block size
|
--csum_block_size 4k Set data checksum block size
|
||||||
--device_block_size 4k Set device block size
|
--device_block_size 4k Set device block size
|
||||||
|
|
|
@ -267,7 +267,7 @@ OSD отключены fsync-и.
|
||||||
```
|
```
|
||||||
--object_size 128k Размер блока хранилища
|
--object_size 128k Размер блока хранилища
|
||||||
--bitmap_granularity 4k Гранулярность битовых карт
|
--bitmap_granularity 4k Гранулярность битовых карт
|
||||||
--journal_size 16M Размер журнала
|
--journal_size 32M Размер журнала
|
||||||
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
|
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
|
||||||
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
|
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
|
||||||
--device_block_size 4k Размер блока устройства
|
--device_block_size 4k Размер блока устройства
|
||||||
|
|
28
mon/mon.js
28
mon/mon.js
|
@ -675,7 +675,12 @@ class Mon
|
||||||
{
|
{
|
||||||
this.parse_kv(e.kv);
|
this.parse_kv(e.kv);
|
||||||
const key = e.kv.key.substr(this.etcd_prefix.length);
|
const key = e.kv.key.substr(this.etcd_prefix.length);
|
||||||
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
|
if (key.substr(0, 11) == '/osd/state/')
|
||||||
|
{
|
||||||
|
stats_changed = true;
|
||||||
|
changed = true;
|
||||||
|
}
|
||||||
|
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
|
||||||
{
|
{
|
||||||
stats_changed = true;
|
stats_changed = true;
|
||||||
}
|
}
|
||||||
|
@ -1635,9 +1640,13 @@ class Mon
|
||||||
}
|
}
|
||||||
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
||||||
// Sum derived values instead of deriving summed
|
// Sum derived values instead of deriving summed
|
||||||
for (const osd in this.state.osd.stats)
|
for (const osd in this.state.osd.state)
|
||||||
{
|
{
|
||||||
const derived = this.prev_stats.osd_diff[osd];
|
const derived = this.prev_stats.osd_diff[osd];
|
||||||
|
if (!this.state.osd.state[osd] || !derived)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (const type in sum_diff)
|
for (const type in sum_diff)
|
||||||
{
|
{
|
||||||
for (const op in derived[type]||{})
|
for (const op in derived[type]||{})
|
||||||
|
@ -1738,9 +1747,13 @@ class Mon
|
||||||
const used = this.state.pool.stats[pool_id].used_raw_tb;
|
const used = this.state.pool.stats[pool_id].used_raw_tb;
|
||||||
this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
|
this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
|
||||||
}
|
}
|
||||||
for (const osd_num in this.state.osd.inodestats)
|
for (const osd_num in this.state.osd.state)
|
||||||
{
|
{
|
||||||
const ist = this.state.osd.inodestats[osd_num];
|
const ist = this.state.osd.inodestats[osd_num];
|
||||||
|
if (!ist || !this.state.osd.state[osd_num])
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (const pool_id in ist)
|
for (const pool_id in ist)
|
||||||
{
|
{
|
||||||
inode_stats[pool_id] = inode_stats[pool_id] || {};
|
inode_stats[pool_id] = inode_stats[pool_id] || {};
|
||||||
|
@ -1756,9 +1769,14 @@ class Mon
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (const osd in this.prev_stats.osd_diff)
|
for (const osd in this.state.osd.state)
|
||||||
{
|
{
|
||||||
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
|
const osd_diff = this.prev_stats.osd_diff[osd];
|
||||||
|
if (!osd_diff || !this.state.osd.state[osd])
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (const pool_id in osd_diff.inode_stats)
|
||||||
{
|
{
|
||||||
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
|
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "vitastor-mon",
|
"name": "vitastor-mon",
|
||||||
"version": "1.4.2",
|
"version": "1.4.7",
|
||||||
"description": "Vitastor SDS monitor service",
|
"description": "Vitastor SDS monitor service",
|
||||||
"main": "mon-main.js",
|
"main": "mon-main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|
|
@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '1.4.2'
|
VERSION = '1.4.7'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -24,4 +24,4 @@ rm fio
|
||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-1.4.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.2$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-1.4.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.7$(rpm --eval '%dist').tar.gz *
|
||||||
|
|
|
@ -36,7 +36,7 @@ ADD . /root/vitastor
|
||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.4.2.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.4.7.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.4.2
|
Version: 1.4.7
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.4.2.el7.tar.gz
|
Source0: vitastor-1.4.7.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.4.2.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.4.7.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.4.2
|
Version: 1.4.7
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.4.2.el8.tar.gz
|
Source0: vitastor-1.4.7.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -18,7 +18,7 @@ ADD . /root/vitastor
|
||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.4.2.el9.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.4.7.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.4.2
|
Version: 1.4.7
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.4.2.el9.tar.gz
|
Source0: vitastor-1.4.7.el9.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -16,8 +16,8 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="1.4.2")
|
add_definitions(-DVERSION="1.4.7")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
add_link_options(-fno-omit-frame-pointer)
|
add_link_options(-fno-omit-frame-pointer)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address)
|
add_definitions(-fsanitize=address)
|
||||||
|
|
|
@ -108,6 +108,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||||
{
|
{
|
||||||
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||||
}
|
}
|
||||||
|
else if (journal_block_size > MAX_DATA_BLOCK_SIZE)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("journal_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||||
|
}
|
||||||
if (!meta_block_size)
|
if (!meta_block_size)
|
||||||
{
|
{
|
||||||
meta_block_size = 4096;
|
meta_block_size = 4096;
|
||||||
|
@ -116,6 +120,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||||
{
|
{
|
||||||
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||||
}
|
}
|
||||||
|
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||||
|
}
|
||||||
if (data_offset % disk_alignment)
|
if (data_offset % disk_alignment)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
|
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
|
||||||
|
|
|
@ -19,7 +19,6 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
|
||||||
syncing_flushers = 0;
|
syncing_flushers = 0;
|
||||||
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
|
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
|
||||||
flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
|
flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
|
||||||
journal_trim_interval = 512;
|
|
||||||
journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
|
journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
|
||||||
trim_wanted = bs->journal.flush_journal ? 1 : 0;
|
trim_wanted = bs->journal.flush_journal ? 1 : 0;
|
||||||
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
|
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
|
||||||
|
@ -94,7 +93,7 @@ void journal_flusher_t::loop()
|
||||||
void journal_flusher_t::enqueue_flush(obj_ver_id ov)
|
void journal_flusher_t::enqueue_flush(obj_ver_id ov)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
printf("enqueue_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||||
#endif
|
#endif
|
||||||
auto it = flush_versions.find(ov.oid);
|
auto it = flush_versions.find(ov.oid);
|
||||||
if (it != flush_versions.end())
|
if (it != flush_versions.end())
|
||||||
|
@ -117,7 +116,7 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
|
||||||
void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
|
void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
printf("unshift_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||||
#endif
|
#endif
|
||||||
auto it = flush_versions.find(ov.oid);
|
auto it = flush_versions.find(ov.oid);
|
||||||
if (it != flush_versions.end())
|
if (it != flush_versions.end())
|
||||||
|
@ -143,7 +142,7 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
|
||||||
void journal_flusher_t::remove_flush(object_id oid)
|
void journal_flusher_t::remove_flush(object_id oid)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
|
printf("undo_flush %jx:%jx\n", oid.inode, oid.stripe);
|
||||||
#endif
|
#endif
|
||||||
auto v_it = flush_versions.find(oid);
|
auto v_it = flush_versions.find(oid);
|
||||||
if (v_it != flush_versions.end())
|
if (v_it != flush_versions.end())
|
||||||
|
@ -184,8 +183,7 @@ void journal_flusher_t::mark_trim_possible()
|
||||||
if (trim_wanted > 0)
|
if (trim_wanted > 0)
|
||||||
{
|
{
|
||||||
dequeuing = true;
|
dequeuing = true;
|
||||||
if (!journal_trim_counter)
|
journal_trim_counter = 0;
|
||||||
journal_trim_counter = journal_trim_interval;
|
|
||||||
bs->ringloop->wakeup();
|
bs->ringloop->wakeup();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -235,7 +233,7 @@ void journal_flusher_t::dump_diagnostics()
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"Flusher: queued=%ld first=%s%lx:%lx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
|
"Flusher: queued=%zd first=%s%jx:%jx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
|
||||||
flush_queue.size(), unflushable_type, unflushable.oid.inode, unflushable.oid.stripe,
|
flush_queue.size(), unflushable_type, unflushable.oid.inode, unflushable.oid.stripe,
|
||||||
trim_wanted, dequeuing, trimming, cur_flusher_count, target_flusher_count,
|
trim_wanted, dequeuing, trimming, cur_flusher_count, target_flusher_count,
|
||||||
active_flushers, syncing_flushers
|
active_flushers, syncing_flushers
|
||||||
|
@ -268,7 +266,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
|
||||||
{
|
{
|
||||||
int search_left = flush_queue.size() - 1;
|
int search_left = flush_queue.size() - 1;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Flusher overran writers (%lx:%lx v%lu, dirty_start=%08lx) - searching for older flushes (%d left)\n",
|
printf("Flusher overran writers (%jx:%jx v%ju, dirty_start=%08jx) - searching for older flushes (%d left)\n",
|
||||||
cur.oid.inode, cur.oid.stripe, cur.version, bs->journal.dirty_start, search_left);
|
cur.oid.inode, cur.oid.stripe, cur.version, bs->journal.dirty_start, search_left);
|
||||||
#endif
|
#endif
|
||||||
while (search_left > 0)
|
while (search_left > 0)
|
||||||
|
@ -285,7 +283,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
|
||||||
dirty_end->second.journal_sector < bs->journal.used_start))
|
dirty_end->second.journal_sector < bs->journal.used_start))
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Write %lx:%lx v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
|
printf("Write %jx:%jx v%ju is too new: offset=%08jx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
|
||||||
#endif
|
#endif
|
||||||
enqueue_flush(cur);
|
enqueue_flush(cur);
|
||||||
}
|
}
|
||||||
|
@ -366,9 +364,10 @@ resume_0:
|
||||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||||
{
|
{
|
||||||
stop_flusher:
|
stop_flusher:
|
||||||
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
|
if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
|
||||||
{
|
{
|
||||||
// Attempt forced trim
|
// Attempt forced trim
|
||||||
|
cur.oid = {};
|
||||||
flusher->active_flushers++;
|
flusher->active_flushers++;
|
||||||
goto trim_journal;
|
goto trim_journal;
|
||||||
}
|
}
|
||||||
|
@ -387,7 +386,7 @@ stop_flusher:
|
||||||
if (repeat_it != flusher->sync_to_repeat.end())
|
if (repeat_it != flusher->sync_to_repeat.end())
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Postpone %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
printf("Postpone %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
#endif
|
#endif
|
||||||
// We don't flush different parts of history of the same object in parallel
|
// We don't flush different parts of history of the same object in parallel
|
||||||
// So we check if someone is already flushing this object
|
// So we check if someone is already flushing this object
|
||||||
|
@ -416,12 +415,13 @@ stop_flusher:
|
||||||
flusher->sync_to_repeat.erase(cur.oid);
|
flusher->sync_to_repeat.erase(cur.oid);
|
||||||
if (!flusher->try_find_other(dirty_end, cur))
|
if (!flusher->try_find_other(dirty_end, cur))
|
||||||
{
|
{
|
||||||
|
cur.oid = {};
|
||||||
goto stop_flusher;
|
goto stop_flusher;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Flushing %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
#endif
|
#endif
|
||||||
flusher->active_flushers++;
|
flusher->active_flushers++;
|
||||||
// Find it in clean_db
|
// Find it in clean_db
|
||||||
|
@ -448,7 +448,7 @@ stop_flusher:
|
||||||
// Object not allocated. This is a bug.
|
// Object not allocated. This is a bug.
|
||||||
char err[1024];
|
char err[1024];
|
||||||
snprintf(
|
snprintf(
|
||||||
err, 1024, "BUG: Object %lx:%lx v%lu that we are trying to flush is not allocated on the data device",
|
err, 1024, "BUG: Object %jx:%jx v%ju that we are trying to flush is not allocated on the data device",
|
||||||
cur.oid.inode, cur.oid.stripe, cur.version
|
cur.oid.inode, cur.oid.stripe, cur.version
|
||||||
);
|
);
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
|
@ -538,7 +538,7 @@ resume_2:
|
||||||
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
|
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
|
||||||
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
|
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
|
||||||
{
|
{
|
||||||
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
|
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",
|
||||||
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
|
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
|
||||||
old_entry->version, cur.oid.inode, cur.oid.stripe);
|
old_entry->version, cur.oid.inode, cur.oid.stripe);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -571,7 +571,7 @@ resume_2:
|
||||||
// Erase dirty_db entries
|
// Erase dirty_db entries
|
||||||
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
|
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
|
printf("Flushed %jx:%jx v%ju (%d copies, wr:%d, del:%d), %jd left\n", cur.oid.inode, cur.oid.stripe, cur.version,
|
||||||
copy_count, has_writes, has_delete, flusher->flush_queue.size());
|
copy_count, has_writes, has_delete, flusher->flush_queue.size());
|
||||||
#endif
|
#endif
|
||||||
release_oid:
|
release_oid:
|
||||||
|
@ -584,7 +584,8 @@ resume_2:
|
||||||
flusher->sync_to_repeat.erase(repeat_it);
|
flusher->sync_to_repeat.erase(repeat_it);
|
||||||
trim_journal:
|
trim_journal:
|
||||||
// Clear unused part of the journal every <journal_trim_interval> flushes
|
// Clear unused part of the journal every <journal_trim_interval> flushes
|
||||||
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
|
if (bs->journal_trim_interval && !((++flusher->journal_trim_counter) % bs->journal_trim_interval) ||
|
||||||
|
flusher->trim_wanted > 0)
|
||||||
{
|
{
|
||||||
resume_26:
|
resume_26:
|
||||||
resume_27:
|
resume_27:
|
||||||
|
@ -609,8 +610,8 @@ void journal_flusher_co::update_metadata_entry()
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
has_delete
|
has_delete
|
||||||
? "Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx v%lu\n"
|
? "Fatal error (metadata corruption or bug): tried to delete metadata entry %ju (%jx:%jx v%ju) while deleting %jx:%jx v%ju\n"
|
||||||
: "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
|
: "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
|
||||||
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
||||||
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
||||||
);
|
);
|
||||||
|
@ -710,7 +711,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int
|
||||||
if (wait_state == wait_base)
|
if (wait_state == wait_base)
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
await_sqe(0);
|
await_sqe(0);
|
||||||
data->iov = (struct iovec){ meta_block.buf, bs->dsk.meta_block_size };
|
data->iov = (struct iovec){ meta_block.buf, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_block.sector
|
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_block.sector
|
||||||
|
@ -760,7 +761,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||||
{
|
{
|
||||||
// If we encounter bad checksums during flush, we still update the bad block,
|
// If we encounter bad checksums during flush, we still update the bad block,
|
||||||
// but intentionally mangle checksums to avoid hiding the corruption.
|
// but intentionally mangle checksums to avoid hiding the corruption.
|
||||||
iovec iov = { .iov_base = v[i].buf, .iov_len = v[i].len };
|
iovec iov = { .iov_base = v[i].buf, .iov_len = (size_t)v[i].len };
|
||||||
if (!(v[i].copy_flags & COPY_BUF_JOURNAL))
|
if (!(v[i].copy_flags & COPY_BUF_JOURNAL))
|
||||||
{
|
{
|
||||||
assert(!(v[i].offset % bs->dsk.csum_block_size));
|
assert(!(v[i].offset % bs->dsk.csum_block_size));
|
||||||
|
@ -768,7 +769,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||||
bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
|
bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
|
||||||
v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||||
{
|
{
|
||||||
printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n",
|
printf("Checksum mismatch in object %jx:%jx v%ju in data area at offset 0x%jx+0x%x: got %08x, expected %08x\n",
|
||||||
cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
|
cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
|
||||||
for (uint32_t j = 0; j < bs->dsk.csum_block_size; j += bs->dsk.bitmap_granularity)
|
for (uint32_t j = 0; j < bs->dsk.csum_block_size; j += bs->dsk.bitmap_granularity)
|
||||||
{
|
{
|
||||||
|
@ -781,7 +782,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||||
{
|
{
|
||||||
bs->verify_journal_checksums(v[i].csum_buf, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
bs->verify_journal_checksums(v[i].csum_buf, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||||
{
|
{
|
||||||
printf("Checksum mismatch in object %lx:%lx v%lu in journal at offset 0x%lx+0x%x (block offset 0x%lx): got %08x, expected %08x\n",
|
printf("Checksum mismatch in object %jx:%jx v%ju in journal at offset 0x%jx+0x%x (block offset 0x%jx): got %08x, expected %08x\n",
|
||||||
cur.oid.inode, cur.oid.stripe, old_clean_ver,
|
cur.oid.inode, cur.oid.stripe, old_clean_ver,
|
||||||
v[i].disk_offset, bad_block, v[i].offset, calc_csum, stored_csum);
|
v[i].disk_offset, bad_block, v[i].offset, calc_csum, stored_csum);
|
||||||
bad_block += (v[i].offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size;
|
bad_block += (v[i].offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size;
|
||||||
|
@ -805,7 +806,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||||
if (new_entry->oid != cur.oid)
|
if (new_entry->oid != cur.oid)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Fatal error (metadata corruption or bug): tried to make holes in %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
|
"Fatal error (metadata corruption or bug): tried to make holes in %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
|
||||||
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
||||||
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
||||||
);
|
);
|
||||||
|
@ -925,7 +926,7 @@ void journal_flusher_co::scan_dirty()
|
||||||
{
|
{
|
||||||
char err[1024];
|
char err[1024];
|
||||||
snprintf(
|
snprintf(
|
||||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
|
err, 1024, "BUG: Unexpected dirty_entry %jx:%jx v%ju unstable state during flush: 0x%x",
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||||
);
|
);
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
|
@ -1021,7 +1022,7 @@ void journal_flusher_co::scan_dirty()
|
||||||
// May happen if the metadata entry is corrupt, but journal isn't
|
// May happen if the metadata entry is corrupt, but journal isn't
|
||||||
// FIXME: Report corrupted object to the upper layer (OSD)
|
// FIXME: Report corrupted object to the upper layer (OSD)
|
||||||
printf(
|
printf(
|
||||||
"Warning: object %lx:%lx has overwrites, but doesn't have a clean version."
|
"Warning: object %jx:%jx has overwrites, but doesn't have a clean version."
|
||||||
" Metadata is likely corrupted. Dropping object from the DB.\n",
|
" Metadata is likely corrupted. Dropping object from the DB.\n",
|
||||||
cur.oid.inode, cur.oid.stripe
|
cur.oid.inode, cur.oid.stripe
|
||||||
);
|
);
|
||||||
|
@ -1056,7 +1057,7 @@ void journal_flusher_co::scan_dirty()
|
||||||
flusher->enqueue_flush(cur);
|
flusher->enqueue_flush(cur);
|
||||||
cur.version = dirty_end->first.version;
|
cur.version = dirty_end->first.version;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Partial checksum block overwrites found - rewinding flush back to %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
printf("Partial checksum block overwrites found - rewinding flush back to %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
#endif
|
#endif
|
||||||
v.clear();
|
v.clear();
|
||||||
copy_count = 0;
|
copy_count = 0;
|
||||||
|
@ -1084,7 +1085,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
|
||||||
auto & vi = v[v.size()-i];
|
auto & vi = v[v.size()-i];
|
||||||
assert(vi.len != 0);
|
assert(vi.len != 0);
|
||||||
vi.buf = memalign_or_die(MEM_ALIGNMENT, vi.len);
|
vi.buf = memalign_or_die(MEM_ALIGNMENT, vi.len);
|
||||||
data->iov = (struct iovec){ vi.buf, vi.len };
|
data->iov = (struct iovec){ vi.buf, (size_t)vi.len };
|
||||||
data->callback = simple_callback_r;
|
data->callback = simple_callback_r;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
|
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
|
||||||
|
@ -1208,7 +1209,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
|
||||||
.usage_count = 1,
|
.usage_count = 1,
|
||||||
}).first;
|
}).first;
|
||||||
await_sqe(0);
|
await_sqe(0);
|
||||||
data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size };
|
data->iov = (struct iovec){ wr.it->second.buf, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = simple_callback_r;
|
data->callback = simple_callback_r;
|
||||||
wr.submitted = true;
|
wr.submitted = true;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
|
@ -1247,7 +1248,7 @@ void journal_flusher_co::free_data_blocks()
|
||||||
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
||||||
bool used = uo_it != bs->used_clean_objects.end();
|
bool used = uo_it != bs->used_clean_objects.end();
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
printf("%s block %ju from %jx:%jx v%ju (new location is %ju)\n",
|
||||||
used ? "Postpone free" : "Free",
|
used ? "Postpone free" : "Free",
|
||||||
old_clean_loc >> bs->dsk.block_order,
|
old_clean_loc >> bs->dsk.block_order,
|
||||||
cur.oid.inode, cur.oid.stripe, cur.version,
|
cur.oid.inode, cur.oid.stripe, cur.version,
|
||||||
|
@ -1264,7 +1265,7 @@ void journal_flusher_co::free_data_blocks()
|
||||||
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
||||||
bool used = uo_it != bs->used_clean_objects.end();
|
bool used = uo_it != bs->used_clean_objects.end();
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("%s block %lu from %lx:%lx v%lu (delete)\n",
|
printf("%s block %ju from %jx:%jx v%ju (delete)\n",
|
||||||
used ? "Postpone free" : "Free",
|
used ? "Postpone free" : "Free",
|
||||||
old_clean_loc >> bs->dsk.block_order,
|
old_clean_loc >> bs->dsk.block_order,
|
||||||
cur.oid.inode, cur.oid.stripe, cur.version);
|
cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
|
@ -1346,7 +1347,6 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||||
else if (wait_state == wait_base+2) goto resume_2;
|
else if (wait_state == wait_base+2) goto resume_2;
|
||||||
else if (wait_state == wait_base+3) goto resume_3;
|
else if (wait_state == wait_base+3) goto resume_3;
|
||||||
else if (wait_state == wait_base+4) goto resume_4;
|
else if (wait_state == wait_base+4) goto resume_4;
|
||||||
flusher->journal_trim_counter = 0;
|
|
||||||
new_trim_pos = bs->journal.get_trim_pos();
|
new_trim_pos = bs->journal.get_trim_pos();
|
||||||
if (new_trim_pos != bs->journal.used_start)
|
if (new_trim_pos != bs->journal.used_start)
|
||||||
{
|
{
|
||||||
|
@ -1378,7 +1378,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||||
.csum_block_size = bs->dsk.csum_block_size,
|
.csum_block_size = bs->dsk.csum_block_size,
|
||||||
};
|
};
|
||||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
|
data->iov = (struct iovec){ flusher->journal_superblock, (size_t)bs->dsk.journal_block_size };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
|
@ -1410,7 +1410,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||||
}
|
}
|
||||||
bs->journal.used_start = new_trim_pos;
|
bs->journal.used_start = new_trim_pos;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Journal trimmed to %08lx (next_free=%08lx dirty_start=%08lx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
|
printf("Journal trimmed to %08jx (next_free=%08jx dirty_start=%08jx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
|
||||||
#endif
|
#endif
|
||||||
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
||||||
{
|
{
|
||||||
|
@ -1419,6 +1419,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
flusher->journal_trim_counter = 0;
|
||||||
flusher->trimming = false;
|
flusher->trimming = false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -107,7 +107,7 @@ class journal_flusher_t
|
||||||
blockstore_impl_t *bs;
|
blockstore_impl_t *bs;
|
||||||
friend class journal_flusher_co;
|
friend class journal_flusher_co;
|
||||||
|
|
||||||
int journal_trim_counter, journal_trim_interval;
|
int journal_trim_counter;
|
||||||
bool trimming;
|
bool trimming;
|
||||||
void* journal_superblock;
|
void* journal_superblock;
|
||||||
|
|
||||||
|
|
|
@ -195,6 +195,10 @@ void blockstore_impl_t::loop()
|
||||||
// ring is full, stop submission
|
// ring is full, stop submission
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||||
|
{
|
||||||
|
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (op_idx != new_idx)
|
if (op_idx != new_idx)
|
||||||
|
@ -265,7 +269,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// stop submission if there's still no free space
|
// stop submission if there's still no free space
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
|
printf("Still waiting for %ju SQE(s)\n", PRIV(op)->wait_detail);
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -273,15 +277,15 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||||
{
|
{
|
||||||
if (journal.used_start == PRIV(op)->wait_detail && !unstable_count_changed)
|
if (journal.used_start == PRIV(op)->wait_detail &&
|
||||||
|
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
|
||||||
{
|
{
|
||||||
// do not submit
|
// do not submit
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
|
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
unstable_count_changed = false;
|
|
||||||
flusher->release_trim();
|
flusher->release_trim();
|
||||||
PRIV(op)->wait_for = 0;
|
PRIV(op)->wait_for = 0;
|
||||||
}
|
}
|
||||||
|
@ -353,7 +357,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
unstable_writes.clear();
|
unstable_writes.clear();
|
||||||
unstable_count_changed = true;
|
|
||||||
op->callback = [old_callback](blockstore_op_t *op)
|
op->callback = [old_callback](blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
obj_ver_id *vers = (obj_ver_id*)op->buf;
|
obj_ver_id *vers = (obj_ver_id*)op->buf;
|
||||||
|
|
|
@ -202,7 +202,7 @@ struct blockstore_op_private_t
|
||||||
{
|
{
|
||||||
// Wait status
|
// Wait status
|
||||||
int wait_for;
|
int wait_for;
|
||||||
uint64_t wait_detail;
|
uint64_t wait_detail, wait_detail2;
|
||||||
int pending_ops;
|
int pending_ops;
|
||||||
int op_state;
|
int op_state;
|
||||||
|
|
||||||
|
@ -253,6 +253,7 @@ class blockstore_impl_t
|
||||||
bool inmemory_meta = false;
|
bool inmemory_meta = false;
|
||||||
// Maximum and minimum flusher count
|
// Maximum and minimum flusher count
|
||||||
unsigned max_flusher_count, min_flusher_count;
|
unsigned max_flusher_count, min_flusher_count;
|
||||||
|
unsigned journal_trim_interval;
|
||||||
// Maximum queue depth
|
// Maximum queue depth
|
||||||
unsigned max_write_iodepth = 128;
|
unsigned max_write_iodepth = 128;
|
||||||
// Enable small (journaled) write throttling, useful for the SSD+HDD case
|
// Enable small (journaled) write throttling, useful for the SSD+HDD case
|
||||||
|
@ -276,7 +277,6 @@ class blockstore_impl_t
|
||||||
std::vector<blockstore_op_t*> submit_queue;
|
std::vector<blockstore_op_t*> submit_queue;
|
||||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||||
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
||||||
bool unstable_count_changed = false;
|
|
||||||
int unsynced_queued_ops = 0;
|
int unsynced_queued_ops = 0;
|
||||||
allocator *data_alloc = NULL;
|
allocator *data_alloc = NULL;
|
||||||
uint64_t used_blocks = 0;
|
uint64_t used_blocks = 0;
|
||||||
|
|
|
@ -63,7 +63,7 @@ int blockstore_init_meta::loop()
|
||||||
throw std::runtime_error("Failed to allocate metadata read buffer");
|
throw std::runtime_error("Failed to allocate metadata read buffer");
|
||||||
// Read superblock
|
// Read superblock
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
|
@ -100,7 +100,7 @@ resume_1:
|
||||||
{
|
{
|
||||||
printf("Initializing metadata area\n");
|
printf("Initializing metadata area\n");
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = (struct iovec){ metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
|
@ -153,7 +153,7 @@ resume_1:
|
||||||
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
|
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Metadata format is too new for me (stored version is %lu, max supported %u).\n",
|
"Metadata format is too new for me (stored version is %ju, max supported %u).\n",
|
||||||
hdr->version, BLOCKSTORE_META_FORMAT_V2
|
hdr->version, BLOCKSTORE_META_FORMAT_V2
|
||||||
);
|
);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -167,7 +167,7 @@ resume_1:
|
||||||
printf(
|
printf(
|
||||||
"Configuration stored in metadata superblock"
|
"Configuration stored in metadata superblock"
|
||||||
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
|
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
|
||||||
" differs from OSD configuration (%lu/%u/%lu, %u/%u).\n",
|
" differs from OSD configuration (%ju/%u/%ju, %u/%u).\n",
|
||||||
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
|
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
|
||||||
hdr->data_csum_type, hdr->csum_block_size,
|
hdr->data_csum_type, hdr->csum_block_size,
|
||||||
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
|
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
|
||||||
|
@ -199,7 +199,8 @@ resume_2:
|
||||||
submitted++;
|
submitted++;
|
||||||
next_offset += bufs[i].size;
|
next_offset += bufs[i].size;
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { bufs[i].buf, bufs[i].size };
|
assert(bufs[i].size <= 0x7fffffff);
|
||||||
|
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
|
||||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||||
if (!zero_on_init)
|
if (!zero_on_init)
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||||
|
@ -231,7 +232,8 @@ resume_2:
|
||||||
{
|
{
|
||||||
// write the modified buffer back
|
// write the modified buffer back
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { bufs[i].buf, bufs[i].size };
|
assert(bufs[i].size <= 0x7fffffff);
|
||||||
|
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
|
||||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||||
bufs[i].state = INIT_META_WRITING;
|
bufs[i].state = INIT_META_WRITING;
|
||||||
|
@ -257,7 +259,7 @@ resume_2:
|
||||||
next_offset = entries_to_zero[i]/entries_per_block;
|
next_offset = entries_to_zero[i]/entries_per_block;
|
||||||
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
|
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||||
submitted++;
|
submitted++;
|
||||||
|
@ -273,7 +275,7 @@ resume_5:
|
||||||
memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||||
}
|
}
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||||
submitted++;
|
submitted++;
|
||||||
|
@ -287,7 +289,7 @@ resume_6:
|
||||||
entries_to_zero.clear();
|
entries_to_zero.clear();
|
||||||
}
|
}
|
||||||
// metadata read finished
|
// metadata read finished
|
||||||
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
printf("Metadata entries loaded: %ju, free blocks: %ju / %ju\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
||||||
if (!bs->inmemory_meta)
|
if (!bs->inmemory_meta)
|
||||||
{
|
{
|
||||||
free(metadata_buffer);
|
free(metadata_buffer);
|
||||||
|
@ -328,7 +330,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||||
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
|
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
|
||||||
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
|
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
|
||||||
{
|
{
|
||||||
printf("Metadata entry %lu is corrupt (checksum mismatch), skipping\n", done_cnt+i);
|
printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -366,7 +368,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||||
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
|
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
|
||||||
}
|
}
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
printf("Free block %ju from %jx:%jx v%ju (new location is %ju)\n",
|
||||||
old_clean_loc,
|
old_clean_loc,
|
||||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||||
done_cnt+i);
|
done_cnt+i);
|
||||||
|
@ -380,7 +382,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||||
}
|
}
|
||||||
entries_loaded++;
|
entries_loaded++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
printf("Allocate block (clean entry) %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||||
#endif
|
#endif
|
||||||
bs->data_alloc->set(done_cnt+i, true);
|
bs->data_alloc->set(done_cnt+i, true);
|
||||||
clean_db[entry->oid] = (struct clean_entry){
|
clean_db[entry->oid] = (struct clean_entry){
|
||||||
|
@ -394,7 +396,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||||
updated = true;
|
updated = true;
|
||||||
memset(entry, 0, bs->dsk.clean_entry_size);
|
memset(entry, 0, bs->dsk.clean_entry_size);
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Old clean entry %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
printf("Old clean entry %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -466,7 +468,7 @@ int blockstore_init_journal::loop()
|
||||||
if (!sqe)
|
if (!sqe)
|
||||||
throw std::runtime_error("io_uring is full while trying to read journal");
|
throw std::runtime_error("io_uring is full while trying to read journal");
|
||||||
data = ((ring_data_t*)sqe->user_data);
|
data = ((ring_data_t*)sqe->user_data);
|
||||||
data->iov = { submitted_buf, bs->journal.block_size };
|
data->iov = { submitted_buf, (size_t)bs->journal.block_size };
|
||||||
data->callback = simple_callback;
|
data->callback = simple_callback;
|
||||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||||
bs->ringloop->submit();
|
bs->ringloop->submit();
|
||||||
|
@ -507,7 +509,7 @@ resume_1:
|
||||||
// FIXME: Randomize initial crc32. Track crc32 when trimming.
|
// FIXME: Randomize initial crc32. Track crc32 when trimming.
|
||||||
printf("Resetting journal\n");
|
printf("Resetting journal\n");
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
|
data->iov = (struct iovec){ submitted_buf, (size_t)(2*bs->journal.block_size) };
|
||||||
data->callback = simple_callback;
|
data->callback = simple_callback;
|
||||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
|
@ -557,7 +559,7 @@ resume_1:
|
||||||
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
|
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
|
stderr, "The code only supports journal versions 2 and 1, but it is %ju on disk."
|
||||||
" Please use vitastor-disk to rewrite the journal\n",
|
" Please use vitastor-disk to rewrite the journal\n",
|
||||||
je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
|
je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
|
||||||
);
|
);
|
||||||
|
@ -606,7 +608,7 @@ resume_1:
|
||||||
submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos;
|
submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos;
|
||||||
data->iov = {
|
data->iov = {
|
||||||
submitted_buf,
|
submitted_buf,
|
||||||
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
(size_t)(end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE),
|
||||||
};
|
};
|
||||||
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
||||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
||||||
|
@ -622,7 +624,7 @@ resume_1:
|
||||||
if (init_write_buf && !bs->readonly)
|
if (init_write_buf && !bs->readonly)
|
||||||
{
|
{
|
||||||
GET_SQE();
|
GET_SQE();
|
||||||
data->iov = { init_write_buf, bs->journal.block_size };
|
data->iov = { init_write_buf, (size_t)bs->journal.block_size };
|
||||||
data->callback = simple_callback;
|
data->callback = simple_callback;
|
||||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
|
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
|
@ -691,7 +693,7 @@ resume_1:
|
||||||
IS_BIG_WRITE(dirty_it->second.state) &&
|
IS_BIG_WRITE(dirty_it->second.state) &&
|
||||||
dirty_it->second.location == UINT64_MAX)
|
dirty_it->second.location == UINT64_MAX)
|
||||||
{
|
{
|
||||||
printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
|
printf("Fatal error (bug): %jx:%jx v%ju big_write journal_entry was allocated over another object\n",
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
@ -699,7 +701,7 @@ resume_1:
|
||||||
bs->flusher->mark_trim_possible();
|
bs->flusher->mark_trim_possible();
|
||||||
bs->journal.dirty_start = bs->journal.next_free;
|
bs->journal.dirty_start = bs->journal.next_free;
|
||||||
printf(
|
printf(
|
||||||
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
|
"Journal entries loaded: %ju, free journal space: %ju bytes (%08jx..%08jx is used), free blocks: %ju / %ju\n",
|
||||||
entries_loaded,
|
entries_loaded,
|
||||||
(bs->journal.next_free >= bs->journal.used_start
|
(bs->journal.next_free >= bs->journal.used_start
|
||||||
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
|
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
|
||||||
|
@ -754,7 +756,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u\n",
|
"je_small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u\n",
|
||||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||||
je->small_write.offset, je->small_write.len
|
je->small_write.offset, je->small_write.len
|
||||||
|
@ -776,7 +778,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
if (location != je->small_write.data_offset)
|
if (location != je->small_write.data_offset)
|
||||||
{
|
{
|
||||||
char err[1024];
|
char err[1024];
|
||||||
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
|
snprintf(err, 1024, "BUG: calculated journal data offset (%08jx) != stored journal data offset (%08jx)", location, je->small_write.data_offset);
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
}
|
}
|
||||||
small_write_data.clear();
|
small_write_data.clear();
|
||||||
|
@ -803,7 +805,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
covered += part_end - part_begin;
|
covered += part_end - part_begin;
|
||||||
small_write_data.push_back((iovec){
|
small_write_data.push_back((iovec){
|
||||||
.iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
|
.iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
|
||||||
.iov_len = part_end - part_begin,
|
.iov_len = (size_t)(part_end - part_begin),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -826,7 +828,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
if (!data_csum_valid)
|
if (!data_csum_valid)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - data crc32 %x != %x\n",
|
"Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - data crc32 %x != %x\n",
|
||||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||||
je->small_write.offset, je->small_write.len,
|
je->small_write.offset, je->small_write.len,
|
||||||
|
@ -845,7 +847,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
if (je->size != required_size)
|
if (je->size != required_size)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Journal entry data has invalid size for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - should be %u bytes but is %u bytes\n",
|
"Journal entry data has invalid size for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - should be %u bytes but is %u bytes\n",
|
||||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||||
je->small_write.offset, je->small_write.len,
|
je->small_write.offset, je->small_write.len,
|
||||||
|
@ -893,7 +895,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
if (block_crc32 != *block_csums)
|
if (block_crc32 != *block_csums)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - block %u crc32 %x != %x\n",
|
"Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - block %u crc32 %x != %x\n",
|
||||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||||
je->small_write.offset, je->small_write.len,
|
je->small_write.offset, je->small_write.len,
|
||||||
|
@ -956,7 +958,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
bs->journal.used_sectors[proc_pos]++;
|
bs->journal.used_sectors[proc_pos]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
@ -972,7 +974,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
|
"je_big_write%s oid=%jx:%jx ver=%ju loc=%ju\n",
|
||||||
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
||||||
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
|
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
|
||||||
);
|
);
|
||||||
|
@ -1049,7 +1051,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
|
"Allocate block (journal) %ju: %jx:%jx v%ju\n",
|
||||||
je->big_write.location >> bs->dsk.block_order,
|
je->big_write.location >> bs->dsk.block_order,
|
||||||
ov.oid.inode, ov.oid.stripe, ov.version
|
ov.oid.inode, ov.oid.stripe, ov.version
|
||||||
);
|
);
|
||||||
|
@ -1059,7 +1061,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
bs->journal.used_sectors[proc_pos]++;
|
bs->journal.used_sectors[proc_pos]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
@ -1074,7 +1076,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
else if (je->type == JE_STABLE)
|
else if (je->type == JE_STABLE)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
|
printf("je_stable oid=%jx:%jx ver=%ju\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
|
||||||
#endif
|
#endif
|
||||||
// oid, version
|
// oid, version
|
||||||
obj_ver_id ov = {
|
obj_ver_id ov = {
|
||||||
|
@ -1086,7 +1088,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
else if (je->type == JE_ROLLBACK)
|
else if (je->type == JE_ROLLBACK)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
|
printf("je_rollback oid=%jx:%jx ver=%ju\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
|
||||||
#endif
|
#endif
|
||||||
// rollback dirty writes of <oid> up to <version>
|
// rollback dirty writes of <oid> up to <version>
|
||||||
obj_ver_id ov = {
|
obj_ver_id ov = {
|
||||||
|
@ -1098,7 +1100,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
else if (je->type == JE_DELETE)
|
else if (je->type == JE_DELETE)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
printf("je_delete oid=%jx:%jx ver=%ju\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||||
#endif
|
#endif
|
||||||
bool dirty_exists = false;
|
bool dirty_exists = false;
|
||||||
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||||
|
|
|
@ -90,8 +90,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||||
}
|
}
|
||||||
// In fact, it's even more rare than "ran out of journal space", so print a warning
|
// In fact, it's even more rare than "ran out of journal space", so print a warning
|
||||||
printf(
|
printf(
|
||||||
"Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
|
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
|
||||||
" is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
|
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
|
||||||
used, bs->journal.sector_count, dirty, next_sector,
|
used, bs->journal.sector_count, dirty, next_sector,
|
||||||
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
|
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
|
||||||
bs->journal.sector_info[next_sector].flush_count
|
bs->journal.sector_info[next_sector].flush_count
|
||||||
|
@ -103,7 +103,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||||
if (data_after > 0)
|
if (data_after > 0)
|
||||||
{
|
{
|
||||||
next_pos = next_pos + data_after;
|
next_pos = next_pos + data_after;
|
||||||
if (next_pos > bs->journal.len)
|
if (next_pos >= bs->journal.len)
|
||||||
{
|
{
|
||||||
if (right_dir)
|
if (right_dir)
|
||||||
next_pos = bs->journal.block_size + data_after;
|
next_pos = bs->journal.block_size + data_after;
|
||||||
|
@ -114,7 +114,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||||
{
|
{
|
||||||
// No space in the journal. Wait until used_start changes.
|
// No space in the journal. Wait until used_start changes.
|
||||||
printf(
|
printf(
|
||||||
"Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
|
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||||
);
|
);
|
||||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||||
|
@ -146,7 +146,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
|
||||||
journal.in_sector_pos = 0;
|
journal.in_sector_pos = 0;
|
||||||
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
||||||
// double check that next_free doesn't cross used_start from the left
|
// double check that next_free doesn't cross used_start from the left
|
||||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||||
journal.next_free = next_next_free;
|
journal.next_free = next_next_free;
|
||||||
memset(journal.inmemory
|
memset(journal.inmemory
|
||||||
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
||||||
|
@ -183,7 +183,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
|
||||||
(journal.inmemory
|
(journal.inmemory
|
||||||
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
|
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
|
||||||
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
|
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
|
||||||
journal.block_size
|
(size_t)journal.block_size
|
||||||
};
|
};
|
||||||
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
|
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
|
@ -263,7 +263,7 @@ uint64_t journal_t::get_trim_pos()
|
||||||
// next_free does not need updating during trim
|
// next_free does not need updating during trim
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||||
used_start, next_free, dirty_start,
|
used_start, next_free, dirty_start,
|
||||||
journal_used_it->first, journal_used_it->second
|
journal_used_it->first, journal_used_it->second
|
||||||
);
|
);
|
||||||
|
@ -276,7 +276,7 @@ uint64_t journal_t::get_trim_pos()
|
||||||
// Journal is cleared up to <journal_used_it>
|
// Journal is cleared up to <journal_used_it>
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||||
used_start, next_free, dirty_start,
|
used_start, next_free, dirty_start,
|
||||||
journal_used_it->first, journal_used_it->second
|
journal_used_it->first, journal_used_it->second
|
||||||
);
|
);
|
||||||
|
@ -296,7 +296,7 @@ void journal_t::dump_diagnostics()
|
||||||
journal_used_it = used_sectors.begin();
|
journal_used_it = used_sectors.begin();
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"Journal: used_start=%08lx next_free=%08lx dirty_start=%08lx trim_to=%08lx trim_to_refs=%ld\n",
|
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
|
||||||
used_start, next_free, dirty_start,
|
used_start, next_free, dirty_start,
|
||||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
||||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
||||||
|
|
|
@ -13,6 +13,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||||
}
|
}
|
||||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||||
|
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
|
||||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||||
|
@ -31,6 +32,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||||
{
|
{
|
||||||
min_flusher_count = 1;
|
min_flusher_count = 1;
|
||||||
}
|
}
|
||||||
|
if (!journal_trim_interval)
|
||||||
|
{
|
||||||
|
journal_trim_interval = 512;
|
||||||
|
}
|
||||||
if (!max_write_iodepth)
|
if (!max_write_iodepth)
|
||||||
{
|
{
|
||||||
max_write_iodepth = 128;
|
max_write_iodepth = 128;
|
||||||
|
|
|
@ -25,7 +25,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
BS_SUBMIT_GET_SQE(sqe, data);
|
BS_SUBMIT_GET_SQE(sqe, data);
|
||||||
data->iov = (struct iovec){ buf, len };
|
data->iov = (struct iovec){ buf, (size_t)len };
|
||||||
PRIV(op)->pending_ops++;
|
PRIV(op)->pending_ops++;
|
||||||
my_uring_prep_readv(
|
my_uring_prep_readv(
|
||||||
sqe,
|
sqe,
|
||||||
|
@ -505,7 +505,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
for (auto & rv: PRIV(read_op)->read_vec)
|
for (auto & rv: PRIV(read_op)->read_vec)
|
||||||
{
|
{
|
||||||
if (rv.journal_sector)
|
if (rv.journal_sector)
|
||||||
journal.used_sectors[rv.journal_sector-1]++;
|
journal.used_sectors.at(rv.journal_sector-1)++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
read_op->retval = 0;
|
read_op->retval = 0;
|
||||||
|
@ -700,7 +700,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
|
||||||
.buf = buf,
|
.buf = buf,
|
||||||
});
|
});
|
||||||
BS_SUBMIT_GET_SQE(sqe, data);
|
BS_SUBMIT_GET_SQE(sqe, data);
|
||||||
data->iov = (struct iovec){ buf, dsk.meta_block_size };
|
data->iov = (struct iovec){ buf, (size_t)dsk.meta_block_size };
|
||||||
PRIV(op)->pending_ops++;
|
PRIV(op)->pending_ops++;
|
||||||
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
||||||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||||
|
@ -855,7 +855,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||||
{
|
{
|
||||||
ok = false;
|
ok = false;
|
||||||
printf(
|
printf(
|
||||||
"Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
"Checksum mismatch in object %jx:%jx v%ju in journal at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
|
||||||
op->oid.inode, op->oid.stripe, op->version,
|
op->oid.inode, op->oid.stripe, op->version,
|
||||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||||
);
|
);
|
||||||
|
@ -875,7 +875,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||||
{
|
{
|
||||||
ok = false;
|
ok = false;
|
||||||
printf(
|
printf(
|
||||||
"Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
"Checksum mismatch in object %jx:%jx v%ju in %s data at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
|
||||||
op->oid.inode, op->oid.stripe, op->version,
|
op->oid.inode, op->oid.stripe, op->version,
|
||||||
(rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
|
(rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
|
||||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||||
|
@ -918,7 +918,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||||
{
|
{
|
||||||
// checksum error
|
// checksum error
|
||||||
printf(
|
printf(
|
||||||
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
|
"Checksum mismatch in object %jx:%jx v%ju in %s area at offset 0x%jx+0x%zx: %08x vs %08x\n",
|
||||||
op->oid.inode, op->oid.stripe, op->version,
|
op->oid.inode, op->oid.stripe, op->version,
|
||||||
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
|
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
|
||||||
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
|
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
|
||||||
|
@ -966,7 +966,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||||
{
|
{
|
||||||
if (rv.journal_sector)
|
if (rv.journal_sector)
|
||||||
{
|
{
|
||||||
auto used = --journal.used_sectors[rv.journal_sector-1];
|
auto used = --journal.used_sectors.at(rv.journal_sector-1);
|
||||||
if (used == 0)
|
if (used == 0)
|
||||||
{
|
{
|
||||||
journal.used_sectors.erase(rv.journal_sector-1);
|
journal.used_sectors.erase(rv.journal_sector-1);
|
||||||
|
|
|
@ -162,7 +162,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||||
unstable_writes.erase(unstab_it);
|
unstable_writes.erase(unstab_it);
|
||||||
else
|
else
|
||||||
unstab_it->second = max_unstable;
|
unstab_it->second = max_unstable;
|
||||||
unstable_count_changed = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -180,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||||
{
|
{
|
||||||
object_id oid = dirty_it->first.oid;
|
object_id oid = dirty_it->first.oid;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
|
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||||
#endif
|
#endif
|
||||||
dirty_it = dirty_end;
|
dirty_it = dirty_end;
|
||||||
// Unblock operations blocked by delete flushing
|
// Unblock operations blocked by delete flushing
|
||||||
|
@ -211,21 +210,26 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||||
dirty_it->second.location != UINT64_MAX)
|
dirty_it->second.location != UINT64_MAX)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> dsk.block_order,
|
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||||
#endif
|
#endif
|
||||||
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
|
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
|
||||||
}
|
}
|
||||||
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
|
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
|
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
if (used == 0)
|
if (used == 0)
|
||||||
{
|
{
|
||||||
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||||
|
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
|
||||||
|
{
|
||||||
|
// Mark current sector as "full" to select the new one
|
||||||
|
journal.in_sector_pos = dsk.journal_block_size;
|
||||||
|
}
|
||||||
flusher->mark_trim_possible();
|
flusher->mark_trim_possible();
|
||||||
}
|
}
|
||||||
free_dirty_dyn_data(dirty_it->second);
|
free_dirty_dyn_data(dirty_it->second);
|
||||||
|
|
|
@ -298,7 +298,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||||
{
|
{
|
||||||
// No such object version
|
// No such object version
|
||||||
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -307,35 +307,49 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
return STAB_SPLIT_DONE;
|
return STAB_SPLIT_DONE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
|
||||||
{
|
|
||||||
// Object write is still in progress. Wait until the write request completes
|
|
||||||
return STAB_SPLIT_WAIT;
|
|
||||||
}
|
|
||||||
else if (!IS_SYNCED(dirty_it->second.state))
|
|
||||||
{
|
|
||||||
// Object not synced yet - sync it
|
|
||||||
// In previous versions we returned EBUSY here and required
|
|
||||||
// the caller (OSD) to issue a global sync first. But a global sync
|
|
||||||
// waits for all writes in the queue including inflight writes. And
|
|
||||||
// inflight writes may themselves be blocked by unstable writes being
|
|
||||||
// still present in the journal and not flushed away from it.
|
|
||||||
// So we must sync specific objects here.
|
|
||||||
//
|
|
||||||
// Even more, we have to process "stabilize" request in parts. That is,
|
|
||||||
// we must stabilize all objects which are already synced. Otherwise
|
|
||||||
// they may block objects which are NOT synced yet.
|
|
||||||
return STAB_SPLIT_SYNC;
|
|
||||||
}
|
|
||||||
else if (IS_STABLE(dirty_it->second.state))
|
else if (IS_STABLE(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Already stable
|
// Already stable
|
||||||
return STAB_SPLIT_DONE;
|
return STAB_SPLIT_DONE;
|
||||||
}
|
}
|
||||||
else
|
while (true)
|
||||||
{
|
{
|
||||||
return STAB_SPLIT_TODO;
|
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
// Object write is still in progress. Wait until the write request completes
|
||||||
|
return STAB_SPLIT_WAIT;
|
||||||
|
}
|
||||||
|
else if (!IS_SYNCED(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
// Object not synced yet - sync it
|
||||||
|
// In previous versions we returned EBUSY here and required
|
||||||
|
// the caller (OSD) to issue a global sync first. But a global sync
|
||||||
|
// waits for all writes in the queue including inflight writes. And
|
||||||
|
// inflight writes may themselves be blocked by unstable writes being
|
||||||
|
// still present in the journal and not flushed away from it.
|
||||||
|
// So we must sync specific objects here.
|
||||||
|
//
|
||||||
|
// Even more, we have to process "stabilize" request in parts. That is,
|
||||||
|
// we must stabilize all objects which are already synced. Otherwise
|
||||||
|
// they may block objects which are NOT synced yet.
|
||||||
|
return STAB_SPLIT_SYNC;
|
||||||
|
}
|
||||||
|
else if (IS_STABLE(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Check previous versions too
|
||||||
|
if (dirty_it == dirty_db.begin())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != ov.oid)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return STAB_SPLIT_TODO;
|
||||||
});
|
});
|
||||||
if (r != 1)
|
if (r != 1)
|
||||||
{
|
{
|
||||||
|
@ -402,7 +416,7 @@ resume_4:
|
||||||
{
|
{
|
||||||
// Mark all dirty_db entries up to op->version as stable
|
// Mark all dirty_db entries up to op->version as stable
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
|
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
|
||||||
#endif
|
#endif
|
||||||
mark_stable(*v);
|
mark_stable(*v);
|
||||||
}
|
}
|
||||||
|
@ -493,7 +507,7 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
||||||
{
|
{
|
||||||
// mark_stable should never be called for in-flight or submitted writes
|
// mark_stable should never be called for in-flight or submitted writes
|
||||||
printf(
|
printf(
|
||||||
"BUG: Attempt to mark_stable object %lx:%lx v%lu state of which is %x\n",
|
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
|
||||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||||
dirty_it->second.state
|
dirty_it->second.state
|
||||||
);
|
);
|
||||||
|
@ -537,6 +551,5 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
||||||
unstab_it->second <= v.version)
|
unstab_it->second <= v.version)
|
||||||
{
|
{
|
||||||
unstable_writes.erase(unstab_it);
|
unstable_writes.erase(unstab_it);
|
||||||
unstable_count_changed = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,8 +92,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -116,11 +115,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||||
sizeof(journal_entry_big_write) + dyn_size
|
sizeof(journal_entry_big_write) + dyn_size
|
||||||
);
|
);
|
||||||
dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
assert(journal.next_free >= journal.used_start
|
||||||
|
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||||
|
: (jsec >= journal.used_start || jsec < journal.next_free));
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
|
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||||
);
|
);
|
||||||
|
@ -174,7 +176,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||||
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Ack sync big %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
|
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||||
#endif
|
#endif
|
||||||
auto & unstab = unstable_writes[it->oid];
|
auto & unstab = unstable_writes[it->oid];
|
||||||
unstab = unstab < it->version ? it->version : unstab;
|
unstab = unstab < it->version ? it->version : unstab;
|
||||||
|
@ -202,7 +204,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||||
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Ack sync small %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
|
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||||
#endif
|
#endif
|
||||||
auto & unstab = unstable_writes[it->oid];
|
auto & unstab = unstable_writes[it->oid];
|
||||||
unstab = unstab < it->version ? it->version : unstab;
|
unstab = unstab < it->version ? it->version : unstab;
|
||||||
|
|
|
@ -85,7 +85,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
// It's allowed to write versions with low numbers over deletes
|
// It's allowed to write versions with low numbers over deletes
|
||||||
// However, we have to flush those deletes first as we use version number for ordering
|
// However, we have to flush those deletes first as we use version number for ordering
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
|
printf("Write %jx:%jx v%ju over delete (real v%ju) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
|
||||||
#endif
|
#endif
|
||||||
wait_del = true;
|
wait_del = true;
|
||||||
PRIV(op)->real_version = op->version;
|
PRIV(op)->real_version = op->version;
|
||||||
|
@ -95,11 +95,13 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
// Issue an additional sync so the delete reaches the journal
|
// Issue an additional sync so the delete reaches the journal
|
||||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||||
sync_op->opcode = BS_OP_SYNC;
|
sync_op->opcode = BS_OP_SYNC;
|
||||||
sync_op->callback = [this, op](blockstore_op_t *sync_op)
|
sync_op->oid = op->oid;
|
||||||
|
sync_op->version = op->version;
|
||||||
|
sync_op->callback = [this](blockstore_op_t *sync_op)
|
||||||
{
|
{
|
||||||
flusher->unshift_flush((obj_ver_id){
|
flusher->unshift_flush((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = sync_op->oid,
|
||||||
.version = op->version-1,
|
.version = sync_op->version-1,
|
||||||
}, true);
|
}, true);
|
||||||
delete sync_op;
|
delete sync_op;
|
||||||
};
|
};
|
||||||
|
@ -117,7 +119,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// Invalid version requested
|
// Invalid version requested
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
|
printf("Write %jx:%jx v%ju requested, but we already have v%ju\n", op->oid.inode, op->oid.stripe, op->version, version);
|
||||||
#endif
|
#endif
|
||||||
op->retval = -EEXIST;
|
op->retval = -EEXIST;
|
||||||
if (!is_del && alloc_dyn_data)
|
if (!is_del && alloc_dyn_data)
|
||||||
|
@ -144,9 +146,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
unsynced_queued_ops++;
|
unsynced_queued_ops++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
if (is_del)
|
if (is_del)
|
||||||
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
|
printf("Delete %jx:%jx v%ju\n", op->oid.inode, op->oid.stripe, op->version);
|
||||||
else if (!wait_del)
|
else if (!wait_del)
|
||||||
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
printf("Write %jx:%jx v%ju offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
||||||
#endif
|
#endif
|
||||||
// No strict need to add it into dirty_db here except maybe for listings to return
|
// No strict need to add it into dirty_db here except maybe for listings to return
|
||||||
// correct data when there are inflight operations in the queue
|
// correct data when there are inflight operations in the queue
|
||||||
|
@ -286,7 +288,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
// Restore original low version number for unblocked operations
|
// Restore original low version number for unblocked operations
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
|
printf("Restoring %jx:%jx version: v%ju -> v%ju\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
|
||||||
#endif
|
#endif
|
||||||
auto prev_it = dirty_it;
|
auto prev_it = dirty_it;
|
||||||
if (prev_it != dirty_db.begin())
|
if (prev_it != dirty_db.begin())
|
||||||
|
@ -296,7 +298,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// Original version is still invalid
|
// Original version is still invalid
|
||||||
// All subsequent writes to the same object must be canceled too
|
// All subsequent writes to the same object must be canceled too
|
||||||
printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
|
printf("Tried to write %jx:%jx v%ju after delete (old version v%ju), but already have v%ju\n",
|
||||||
op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
|
op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
|
||||||
cancel_all_writes(op, dirty_it, -EEXIST);
|
cancel_all_writes(op, dirty_it, -EEXIST);
|
||||||
return 2;
|
return 2;
|
||||||
|
@ -320,7 +322,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -348,8 +350,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
if (entry->oid.inode || entry->oid.stripe || entry->version)
|
if (entry->oid.inode || entry->oid.stripe || entry->version)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Fatal error (metadata corruption or bug): tried to write object %lx:%lx v%lu"
|
"Fatal error (metadata corruption or bug): tried to write object %jx:%jx v%ju"
|
||||||
" over a non-zero metadata entry %lu with %lx:%lx v%lu\n", op->oid.inode,
|
" over a non-zero metadata entry %ju with %jx:%jx v%ju\n", op->oid.inode,
|
||||||
op->oid.stripe, op->version, loc, entry->oid.inode, entry->oid.stripe, entry->version
|
op->oid.stripe, op->version, loc, entry->oid.inode, entry->oid.stripe, entry->version
|
||||||
);
|
);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -361,7 +363,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Allocate block %lu for %lx:%lx v%lu\n",
|
"Allocate block %ju for %jx:%jx v%ju\n",
|
||||||
loc, op->oid.inode, op->oid.stripe, op->version
|
loc, op->oid.inode, op->oid.stripe, op->version
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
@ -372,13 +374,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
int vcnt = 0;
|
int vcnt = 0;
|
||||||
if (stripe_offset)
|
if (stripe_offset)
|
||||||
{
|
{
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_offset };
|
||||||
}
|
}
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
|
||||||
if (stripe_end)
|
if (stripe_end)
|
||||||
{
|
{
|
||||||
stripe_end = dsk.bitmap_granularity - stripe_end;
|
stripe_end = dsk.bitmap_granularity - stripe_end;
|
||||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
|
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_end };
|
||||||
}
|
}
|
||||||
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
|
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
|
||||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
|
@ -412,7 +414,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|
||||||
|| !space_check.check_available(op, 1,
|
|| !space_check.check_available(op, 1,
|
||||||
sizeof(journal_entry_small_write) + dyn_size,
|
sizeof(journal_entry_small_write) + dyn_size,
|
||||||
op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -436,11 +438,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
|
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
|
||||||
sizeof(journal_entry_small_write) + dyn_size
|
sizeof(journal_entry_small_write) + dyn_size
|
||||||
);
|
);
|
||||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
if (!(journal.next_free >= journal.used_start
|
||||||
|
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||||
|
: (jsec >= journal.used_start || jsec < journal.next_free)))
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
|
||||||
|
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||||
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
|
||||||
|
journal.used_start, journal.next_free
|
||||||
|
);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||||
);
|
);
|
||||||
|
@ -454,8 +468,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
journal_used_it->first < next_next_free + op->len)
|
journal_used_it->first < next_next_free + op->len)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!"
|
"BUG: Attempt to overwrite used offset (%jx, %ju refs) of the journal with the object %jx:%jx v%ju: data at %jx, len %x!"
|
||||||
" Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n",
|
" Journal used_start=%08jx (%ju refs), next_free=%08jx, dirty_start=%08jx\n",
|
||||||
journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
|
journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
|
||||||
journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
|
journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
|
||||||
);
|
);
|
||||||
|
@ -463,7 +477,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// double check that next_free doesn't cross used_start from the left
|
// double check that next_free doesn't cross used_start from the left
|
||||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||||
journal.next_free = next_next_free;
|
journal.next_free = next_next_free;
|
||||||
je->oid = op->oid;
|
je->oid = op->oid;
|
||||||
je->version = op->version;
|
je->version = op->version;
|
||||||
|
@ -505,7 +519,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
if (next_next_free >= journal.len)
|
if (next_next_free >= journal.len)
|
||||||
next_next_free = dsk.journal_block_size;
|
next_next_free = dsk.journal_block_size;
|
||||||
// double check that next_free doesn't cross used_start from the left
|
// double check that next_free doesn't cross used_start from the left
|
||||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||||
journal.next_free = next_next_free;
|
journal.next_free = next_next_free;
|
||||||
if (!(dirty_it->second.state & BS_ST_INSTANT))
|
if (!(dirty_it->second.state & BS_ST_INSTANT))
|
||||||
{
|
{
|
||||||
|
@ -549,7 +563,7 @@ resume_2:
|
||||||
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
|
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -558,11 +572,23 @@ resume_2:
|
||||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||||
sizeof(journal_entry_big_write) + dyn_size
|
sizeof(journal_entry_big_write) + dyn_size
|
||||||
);
|
);
|
||||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
if (!(journal.next_free >= journal.used_start
|
||||||
|
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||||
|
: (jsec >= journal.used_start || jsec < journal.next_free)))
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
|
||||||
|
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||||
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
|
||||||
|
journal.used_start, journal.next_free
|
||||||
|
);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
|
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||||
);
|
);
|
||||||
|
@ -589,7 +615,7 @@ resume_4:
|
||||||
});
|
});
|
||||||
assert(dirty_it != dirty_db.end());
|
assert(dirty_it != dirty_db.end());
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
printf("Ack write %jx:%jx v%ju = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||||
#endif
|
#endif
|
||||||
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
|
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
|
||||||
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
|
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
|
||||||
|
@ -782,7 +808,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||||
);
|
);
|
||||||
|
|
|
@ -77,7 +77,7 @@ struct alloc_osd_t
|
||||||
std::string key = base64_decode(kv["key"].string_value());
|
std::string key = base64_decode(kv["key"].string_value());
|
||||||
osd_num_t cur_osd;
|
osd_num_t cur_osd;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%lu%c", &cur_osd, &null_byte);
|
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%ju%c", &cur_osd, &null_byte);
|
||||||
if (scanned != 1 || !cur_osd)
|
if (scanned != 1 || !cur_osd)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
|
fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
|
||||||
|
|
|
@ -11,7 +11,7 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *re
|
||||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[128];
|
char buf[128];
|
||||||
snprintf(buf, 128, "Inode 0x%lx disappeared", cur);
|
snprintf(buf, 128, "Inode 0x%jx disappeared", cur);
|
||||||
*result = (cli_result_t){ .err = EIO, .text = buf };
|
*result = (cli_result_t){ .err = EIO, .text = buf };
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -160,14 +160,14 @@ struct cli_describe_t
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
|
stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n",
|
||||||
osd_num, op->reply.hdr.retval
|
osd_num, op->reply.hdr.retval
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
|
stderr, "Invalid response size from OSD %ju (expected %ju bytes, got %ju bytes)\n",
|
||||||
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
|
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -178,11 +178,11 @@ struct cli_describe_t
|
||||||
{
|
{
|
||||||
if (!parent->json_output || parent->is_command_line)
|
if (!parent->json_output || parent->is_command_line)
|
||||||
{
|
{
|
||||||
#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
|
#define FMT "{\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"part\":%u,\"osd_num\":%ju%s%s%s}"
|
||||||
printf(
|
printf(
|
||||||
(parent->json_output
|
(parent->json_output
|
||||||
? (count > 0 ? ",\n " FMT : " " FMT)
|
? (count > 0 ? ",\n " FMT : " " FMT)
|
||||||
: "%lx:%lx part %u on OSD %lu%s%s%s\n"),
|
: "%jx:%jx part %u on OSD %ju%s%s%s\n"),
|
||||||
#undef FMT
|
#undef FMT
|
||||||
items[i].inode, items[i].stripe,
|
items[i].inode, items[i].stripe,
|
||||||
items[i].role, items[i].osd_num,
|
items[i].role, items[i].osd_num,
|
||||||
|
|
|
@ -82,7 +82,7 @@ resume_1:
|
||||||
// osd ID
|
// osd ID
|
||||||
osd_num_t osd_num;
|
osd_num_t osd_num;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
|
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
|
||||||
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
|
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||||
|
|
|
@ -136,7 +136,7 @@ struct cli_fix_t
|
||||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
|
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
|
||||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
|
fprintf(stderr, "Object %jx:%jx is from unknown pool\n", obj.inode, obj.stripe);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto & pool_cfg = pool_cfg_it->second;
|
auto & pool_cfg = pool_cfg_it->second;
|
||||||
|
@ -146,7 +146,7 @@ struct cli_fix_t
|
||||||
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
|
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
|
stderr, "Object %jx:%jx is from PG %u/%u which is not currently active\n",
|
||||||
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
|
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
|
||||||
);
|
);
|
||||||
continue;
|
continue;
|
||||||
|
@ -171,7 +171,7 @@ struct cli_fix_t
|
||||||
{
|
{
|
||||||
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
|
fprintf(stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n", primary_osd, op->reply.hdr.retval);
|
||||||
parent->waiting--;
|
parent->waiting--;
|
||||||
loop();
|
loop();
|
||||||
}
|
}
|
||||||
|
@ -209,7 +209,7 @@ struct cli_fix_t
|
||||||
if (rm_op->reply.hdr.retval < 0)
|
if (rm_op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
|
stderr, "Failed to remove object %jx:%jx from OSD %ju (retval=%jd)\n",
|
||||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
|
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
|
||||||
rm_osd_num, rm_op->reply.hdr.retval
|
rm_osd_num, rm_op->reply.hdr.retval
|
||||||
);
|
);
|
||||||
|
@ -226,7 +226,7 @@ struct cli_fix_t
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Removed %lx:%lx (part %lu) from OSD %lu\n",
|
"Removed %jx:%jx (part %ju) from OSD %ju\n",
|
||||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
|
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
|
||||||
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
|
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
|
||||||
);
|
);
|
||||||
|
@ -254,7 +254,7 @@ struct cli_fix_t
|
||||||
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
|
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
|
stderr, "Failed to scrub %jx:%jx on OSD %ju (retval=%jd)\n",
|
||||||
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
|
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -150,7 +150,7 @@ resume_1:
|
||||||
inode_t only_inode_num;
|
inode_t only_inode_num;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
|
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
|
||||||
"/inode/stats/%u/%lu%c", &pool_id, &only_inode_num, &null_byte);
|
"/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
|
||||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
|
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||||
|
@ -456,7 +456,7 @@ std::string format_lat(uint64_t lat)
|
||||||
char buf[256];
|
char buf[256];
|
||||||
int l = 0;
|
int l = 0;
|
||||||
if (lat < 100)
|
if (lat < 100)
|
||||||
l = snprintf(buf, sizeof(buf), "%lu us", lat);
|
l = snprintf(buf, sizeof(buf), "%ju us", lat);
|
||||||
else if (lat < 500000)
|
else if (lat < 500000)
|
||||||
l = snprintf(buf, sizeof(buf), "%.2f ms", (double)lat/1000);
|
l = snprintf(buf, sizeof(buf), "%.2f ms", (double)lat/1000);
|
||||||
else
|
else
|
||||||
|
|
|
@ -202,7 +202,7 @@ struct snap_merger_t
|
||||||
if (parent->progress)
|
if (parent->progress)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
|
"Merging %zd layer(s) into target %s%s (inode %ju in pool %u)\n",
|
||||||
sources.size(), target_cfg->name.c_str(),
|
sources.size(), target_cfg->name.c_str(),
|
||||||
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
||||||
);
|
);
|
||||||
|
@ -275,7 +275,7 @@ struct snap_merger_t
|
||||||
processed++;
|
processed++;
|
||||||
if (parent->progress && !(processed % 128))
|
if (parent->progress && !(processed % 128))
|
||||||
{
|
{
|
||||||
printf("\rFiltering target blocks: %lu/%lu", processed, to_process);
|
printf("\rFiltering target blocks: %ju/%ju", processed, to_process);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (in_flight > 0 || oit != merge_offsets.end())
|
if (in_flight > 0 || oit != merge_offsets.end())
|
||||||
|
@ -285,7 +285,7 @@ struct snap_merger_t
|
||||||
}
|
}
|
||||||
if (parent->progress)
|
if (parent->progress)
|
||||||
{
|
{
|
||||||
printf("\r%lu full blocks of target filtered out\n", to_process-merge_offsets.size());
|
printf("\r%ju full blocks of target filtered out\n", to_process-merge_offsets.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
state = 3;
|
state = 3;
|
||||||
|
@ -320,7 +320,7 @@ struct snap_merger_t
|
||||||
processed++;
|
processed++;
|
||||||
if (parent->progress && !(processed % 128))
|
if (parent->progress && !(processed % 128))
|
||||||
{
|
{
|
||||||
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
|
printf("\rOverwriting blocks: %ju/%ju", processed, to_process);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (in_flight == 0 && rwo_error.size())
|
if (in_flight == 0 && rwo_error.size())
|
||||||
|
@ -339,7 +339,7 @@ struct snap_merger_t
|
||||||
}
|
}
|
||||||
if (parent->progress)
|
if (parent->progress)
|
||||||
{
|
{
|
||||||
printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
|
printf("\rOverwriting blocks: %ju/%ju\n", to_process, to_process);
|
||||||
}
|
}
|
||||||
// Done
|
// Done
|
||||||
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
|
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
|
||||||
|
@ -384,7 +384,7 @@ struct snap_merger_t
|
||||||
auto & name = parent->cli->st_cli.inode_config.at(src).name;
|
auto & name = parent->cli->st_cli.inode_config.at(src).name;
|
||||||
if (parent->progress)
|
if (parent->progress)
|
||||||
{
|
{
|
||||||
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
printf("Got listing of layer %s (inode %ju in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||||
}
|
}
|
||||||
if (delete_source)
|
if (delete_source)
|
||||||
{
|
{
|
||||||
|
@ -416,7 +416,7 @@ struct snap_merger_t
|
||||||
{
|
{
|
||||||
if (op->retval < 0)
|
if (op->retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "error reading target bitmap at offset %lx: %s\n", op->offset, strerror(-op->retval));
|
fprintf(stderr, "error reading target bitmap at offset %jx: %s\n", op->offset, strerror(-op->retval));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -571,7 +571,7 @@ struct snap_merger_t
|
||||||
{
|
{
|
||||||
if (subop->retval != 0)
|
if (subop->retval != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "error deleting from layer 0x%lx at offset %lx: %s", subop->inode, subop->offset, strerror(-subop->retval));
|
fprintf(stderr, "error deleting from layer 0x%jx at offset %jx: %s", subop->inode, subop->offset, strerror(-subop->retval));
|
||||||
}
|
}
|
||||||
delete subop;
|
delete subop;
|
||||||
};
|
};
|
||||||
|
@ -620,7 +620,7 @@ struct snap_merger_t
|
||||||
if (rwo->error_code)
|
if (rwo->error_code)
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Error %s target at offset %lx: %s",
|
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
|
||||||
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
|
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
|
||||||
rwo_error = std::string(buf);
|
rwo_error = std::string(buf);
|
||||||
}
|
}
|
||||||
|
|
|
@ -291,7 +291,7 @@ resume_100:
|
||||||
if (it == parent->cli->st_cli.inode_config.end())
|
if (it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
|
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%jx) not found", cur->name.c_str(), cur->parent_id);
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -384,7 +384,7 @@ resume_100:
|
||||||
pool_id_t pool_id = 0;
|
pool_id_t pool_id = 0;
|
||||||
inode_t inode = 0;
|
inode_t inode = 0;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode, &null_byte);
|
||||||
if (scanned != 2 || !inode)
|
if (scanned != 2 || !inode)
|
||||||
{
|
{
|
||||||
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
|
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
|
||||||
|
@ -439,7 +439,7 @@ resume_100:
|
||||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_child);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
@ -448,7 +448,7 @@ resume_100:
|
||||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_parent);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
@ -576,7 +576,7 @@ resume_100:
|
||||||
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", cur);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
@ -640,7 +640,7 @@ resume_100:
|
||||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", child_inode);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
@ -649,7 +649,7 @@ resume_100:
|
||||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", target_inode);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
@ -670,7 +670,7 @@ resume_100:
|
||||||
if (source == parent->cli->st_cli.inode_config.end())
|
if (source == parent->cli->st_cli.inode_config.end())
|
||||||
{
|
{
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
|
snprintf(buf, 1024, "Inode 0x%jx disappeared", inode);
|
||||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -95,7 +95,7 @@ struct rm_inode_t
|
||||||
fprintf(stderr, "Some data may remain after delete on OSDs which are currently down: ");
|
fprintf(stderr, "Some data may remain after delete on OSDs which are currently down: ");
|
||||||
for (int i = 0; i < inactive_osds.size(); i++)
|
for (int i = 0; i < inactive_osds.size(); i++)
|
||||||
{
|
{
|
||||||
fprintf(stderr, i > 0 ? ", %lu" : "%lu", inactive_osds[i]);
|
fprintf(stderr, i > 0 ? ", %ju" : "%ju", inactive_osds[i]);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
@ -138,7 +138,7 @@ struct rm_inode_t
|
||||||
cur_list->in_flight--;
|
cur_list->in_flight--;
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
fprintf(stderr, "Failed to remove object %jx:%jx from PG %u (OSD %ju) (retval=%jd)\n",
|
||||||
op->req.rw.inode, op->req.rw.offset,
|
op->req.rw.inode, op->req.rw.offset,
|
||||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||||
error_count++;
|
error_count++;
|
||||||
|
@ -174,7 +174,7 @@ struct rm_inode_t
|
||||||
cur_list->synced = true;
|
cur_list->synced = true;
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to sync OSD %lu (retval=%ld)\n",
|
fprintf(stderr, "Failed to sync OSD %ju (retval=%jd)\n",
|
||||||
cur_list->rm_osd_num, op->reply.hdr.retval);
|
cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||||
error_count++;
|
error_count++;
|
||||||
}
|
}
|
||||||
|
@ -212,7 +212,7 @@ struct rm_inode_t
|
||||||
}
|
}
|
||||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
fprintf(stderr, "\rRemoved %ju/%ju objects, %ju more PGs to list...", total_done, total_count, pgs_to_list);
|
||||||
total_prev_pct = total_done*1000/total_count;
|
total_prev_pct = total_done*1000/total_count;
|
||||||
}
|
}
|
||||||
if (lists_done && !lists.size())
|
if (lists_done && !lists.size())
|
||||||
|
@ -224,8 +224,8 @@ struct rm_inode_t
|
||||||
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0))
|
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0))
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n"
|
stderr, "Warning: Pool:%u,ID:%ju inode data may not have been fully removed.\n"
|
||||||
" Use `vitastor-cli rm-data --pool %u --inode %lu` if you encounter it in listings.\n",
|
" Use `vitastor-cli rm-data --pool %u --inode %ju` if you encounter it in listings.\n",
|
||||||
pool_id, INODE_NO_POOL(inode), pool_id, INODE_NO_POOL(inode)
|
pool_id, INODE_NO_POOL(inode), pool_id, INODE_NO_POOL(inode)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,7 +106,7 @@ resume_2:
|
||||||
if (etcd_states[i]["error"].is_null())
|
if (etcd_states[i]["error"].is_null())
|
||||||
{
|
{
|
||||||
etcd_alive++;
|
etcd_alive++;
|
||||||
etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
|
etcd_db_size = etcd_states[i]["dbSize"].uint64_value();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int mon_count = 0;
|
int mon_count = 0;
|
||||||
|
@ -132,7 +132,7 @@ resume_2:
|
||||||
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
|
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
|
||||||
osd_num_t stat_osd_num = 0;
|
osd_num_t stat_osd_num = 0;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
|
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%ju%c", &stat_osd_num, &null_byte);
|
||||||
if (scanned != 1 || !stat_osd_num)
|
if (scanned != 1 || !stat_osd_num)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||||
|
@ -283,7 +283,7 @@ resume_2:
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
" cluster:\n"
|
" cluster:\n"
|
||||||
" etcd: %d / %ld up, %s database size\n"
|
" etcd: %d / %zd up, %s database size\n"
|
||||||
" mon: %d up%s\n"
|
" mon: %d up%s\n"
|
||||||
" osd: %d / %d up\n"
|
" osd: %d / %d up\n"
|
||||||
" \n"
|
" \n"
|
||||||
|
|
|
@ -1156,7 +1156,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
if (op->retval != -EPIPE || log_level > 0)
|
if (op->retval != -EPIPE || log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d), dropping connection\n",
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -1164,7 +1164,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
else if (log_level > 0)
|
else if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
|
stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d)\n",
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -226,7 +226,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||||
{
|
{
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %ju (retval=%jd), skipping\n",
|
||||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -236,7 +236,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||||
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||||
cur_list->pg->has_unstable = true;
|
cur_list->pg->has_unstable = true;
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "[PG %u/%u] Inode still has %lu unstable object versions out of total %lu - is it still open?\n",
|
stderr, "[PG %u/%u] Inode still has %ju unstable object versions out of total %ju - is it still open?\n",
|
||||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count,
|
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count,
|
||||||
op->reply.hdr.retval
|
op->reply.hdr.retval
|
||||||
);
|
);
|
||||||
|
@ -244,7 +244,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
stderr, "[PG %u/%u] Got inode object list from OSD %ju: %jd object versions\n",
|
||||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,7 +47,7 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||||
if (!bitmap_granularity)
|
if (!bitmap_granularity)
|
||||||
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||||
if (!journal_size)
|
if (!journal_size)
|
||||||
journal_size = 16*1024*1024;
|
journal_size = 32*1024*1024;
|
||||||
if (!device_block_size)
|
if (!device_block_size)
|
||||||
device_block_size = 4096;
|
device_block_size = 4096;
|
||||||
if (!data_csum_type)
|
if (!data_csum_type)
|
||||||
|
@ -75,9 +75,9 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||||
if (st.st_blksize < device_block_size)
|
if (st.st_blksize < device_block_size)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Warning: %s reports %lu byte blocks, but we use %lu."
|
stderr, "Warning: %s reports %ju byte blocks, but we use %ju."
|
||||||
" Set --device_block_size=%lu if you're sure it works well with %lu byte blocks.\n",
|
" Set --device_block_size=%ju if you're sure it works well with %ju byte blocks.\n",
|
||||||
device.c_str(), st.st_blksize, device_block_size, st.st_blksize, st.st_blksize
|
device.c_str(), (uint64_t)st.st_blksize, device_block_size, (uint64_t)st.st_blksize, (uint64_t)st.st_blksize
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -99,19 +99,19 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||||
if (device_block_size < 512 || device_block_size > 1048576 ||
|
if (device_block_size < 512 || device_block_size > 1048576 ||
|
||||||
device_block_size & (device_block_size-1) != 0)
|
device_block_size & (device_block_size-1) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid device block size specified: %lu\n", device_block_size);
|
fprintf(stderr, "Invalid device block size specified: %ju\n", device_block_size);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (data_block_size < device_block_size || data_block_size > MAX_DATA_BLOCK_SIZE ||
|
if (data_block_size < device_block_size || data_block_size > MAX_DATA_BLOCK_SIZE ||
|
||||||
data_block_size & (data_block_size-1) != 0)
|
data_block_size & (data_block_size-1) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid object size specified: %lu\n", data_block_size);
|
fprintf(stderr, "Invalid object size specified: %ju\n", data_block_size);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (bitmap_granularity < device_block_size || bitmap_granularity > data_block_size ||
|
if (bitmap_granularity < device_block_size || bitmap_granularity > data_block_size ||
|
||||||
bitmap_granularity & (bitmap_granularity-1) != 0)
|
bitmap_granularity & (bitmap_granularity-1) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid bitmap granularity specified: %lu\n", bitmap_granularity);
|
fprintf(stderr, "Invalid bitmap granularity specified: %ju\n", bitmap_granularity);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (csum_block_size && (data_block_size % csum_block_size))
|
if (csum_block_size && (data_block_size % csum_block_size))
|
||||||
|
@ -145,8 +145,8 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||||
{
|
{
|
||||||
// Env
|
// Env
|
||||||
printf(
|
printf(
|
||||||
"meta_block_size=%lu\njournal_block_size=%lu\ndata_size=%lu\n"
|
"meta_block_size=%ju\njournal_block_size=%ju\ndata_size=%ju\n"
|
||||||
"data_device=%s\njournal_offset=%lu\nmeta_offset=%lu\ndata_offset=%lu\n",
|
"data_device=%s\njournal_offset=%ju\nmeta_offset=%ju\ndata_offset=%ju\n",
|
||||||
device_block_size, device_block_size, device_size-data_offset,
|
device_block_size, device_block_size, device_size-data_offset,
|
||||||
device.c_str(), journal_offset, meta_offset, data_offset
|
device.c_str(), journal_offset, meta_offset, data_offset
|
||||||
);
|
);
|
||||||
|
@ -160,14 +160,14 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||||
}
|
}
|
||||||
if (device_block_size != 4096)
|
if (device_block_size != 4096)
|
||||||
{
|
{
|
||||||
printf("--meta_block_size %lu\n--journal_block_size %lu\n", device_block_size, device_block_size);
|
printf("--meta_block_size %ju\n--journal_block_size %ju\n", device_block_size, device_block_size);
|
||||||
}
|
}
|
||||||
if (orig_device_size)
|
if (orig_device_size)
|
||||||
{
|
{
|
||||||
printf("--data_size %lu\n", device_size-data_offset);
|
printf("--data_size %ju\n", device_size-data_offset);
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"--data_device %s\n--journal_offset %lu\n--meta_offset %lu\n--data_offset %lu\n",
|
"--data_device %s\n--journal_offset %ju\n--meta_offset %ju\n--data_offset %ju\n",
|
||||||
device.c_str(), journal_offset, meta_offset, data_offset
|
device.c_str(), journal_offset, meta_offset, data_offset
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,7 +167,7 @@ static const char *help_text =
|
||||||
" Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
|
" Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
|
||||||
" --object_size 128k Set blockstore block size\n"
|
" --object_size 128k Set blockstore block size\n"
|
||||||
" --bitmap_granularity 4k Set bitmap granularity\n"
|
" --bitmap_granularity 4k Set bitmap granularity\n"
|
||||||
" --journal_size 16M Set journal size\n"
|
" --journal_size 32M Set journal size\n"
|
||||||
" --data_csum_type none Set data checksum type (crc32c or none)\n"
|
" --data_csum_type none Set data checksum type (crc32c or none)\n"
|
||||||
" --csum_block_size 4k Set data checksum block size\n"
|
" --csum_block_size 4k Set data checksum block size\n"
|
||||||
" --device_block_size 4k Set device block size\n"
|
" --device_block_size 4k Set device block size\n"
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#ifndef _LARGEFILE64_SOURCE
|
#ifndef _LARGEFILE64_SOURCE
|
||||||
#define _LARGEFILE64_SOURCE 1
|
#define _LARGEFILE64_SOURCE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
|
|
|
@ -38,7 +38,7 @@ int disk_tool_t::dump_journal()
|
||||||
}
|
}
|
||||||
if (json)
|
if (json)
|
||||||
{
|
{
|
||||||
printf("%s{\"offset\":\"0x%lx\"", first_block ? "" : ",\n", journal_pos);
|
printf("%s{\"offset\":\"0x%jx\"", first_block ? "" : ",\n", journal_pos);
|
||||||
first_block = false;
|
first_block = false;
|
||||||
}
|
}
|
||||||
if (s == dsk.journal_block_size)
|
if (s == dsk.journal_block_size)
|
||||||
|
@ -46,13 +46,13 @@ int disk_tool_t::dump_journal()
|
||||||
if (json)
|
if (json)
|
||||||
printf(",\"type\":\"zero\"}");
|
printf(",\"type\":\"zero\"}");
|
||||||
else
|
else
|
||||||
printf("offset %08lx: zeroes\n", journal_pos);
|
printf("offset %08jx: zeroes\n", journal_pos);
|
||||||
journal_pos += dsk.journal_block_size;
|
journal_pos += dsk.journal_block_size;
|
||||||
}
|
}
|
||||||
else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC)
|
else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC)
|
||||||
{
|
{
|
||||||
if (!json)
|
if (!json)
|
||||||
printf("offset %08lx:\n", journal_pos);
|
printf("offset %08jx:\n", journal_pos);
|
||||||
else
|
else
|
||||||
printf(",\"entries\":[\n");
|
printf(",\"entries\":[\n");
|
||||||
if (journal_pos == 0)
|
if (journal_pos == 0)
|
||||||
|
@ -80,9 +80,9 @@ int disk_tool_t::dump_journal()
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (json)
|
if (json)
|
||||||
printf(",\"type\":\"data\",\"pattern\":\"%08lx\"}", *((uint64_t*)journal_buf));
|
printf(",\"type\":\"data\",\"pattern\":\"%08jx\"}", *((uint64_t*)journal_buf));
|
||||||
else
|
else
|
||||||
printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%08lx)\n", journal_pos, *((uint64_t*)journal_buf));
|
printf("offset %08jx: no magic in the beginning, looks like random data (pattern=%08jx)\n", journal_pos, *((uint64_t*)journal_buf));
|
||||||
journal_pos += dsk.journal_block_size;
|
journal_pos += dsk.journal_block_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -98,12 +98,12 @@ int disk_tool_t::dump_journal()
|
||||||
if (json && dump_with_blocks)
|
if (json && dump_with_blocks)
|
||||||
first_entry = true;
|
first_entry = true;
|
||||||
if (!json)
|
if (!json)
|
||||||
printf("offset %08lx:\n", journal_pos);
|
printf("offset %08jx:\n", journal_pos);
|
||||||
auto pos = journal_pos;
|
auto pos = journal_pos;
|
||||||
int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
|
int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
|
||||||
{
|
{
|
||||||
if (json && dump_with_blocks && first_entry)
|
if (json && dump_with_blocks && first_entry)
|
||||||
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
|
printf("%s{\"offset\":\"0x%jx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
|
||||||
dump_journal_entry(num, je, json);
|
dump_journal_entry(num, je, json);
|
||||||
first_block = false;
|
first_block = false;
|
||||||
});
|
});
|
||||||
|
@ -134,12 +134,12 @@ int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
|
||||||
journal_entry *je = (journal_entry*)(data);
|
journal_entry *je = (journal_entry*)(data);
|
||||||
if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
|
if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "offset %08lx: journal superblock is invalid\n", journal_pos);
|
fprintf(stderr, "offset %08jx: journal superblock is invalid\n", journal_pos);
|
||||||
r = 1;
|
r = 1;
|
||||||
}
|
}
|
||||||
else if (je->start.size != JE_START_V0_SIZE && je->start.version != JOURNAL_VERSION_V1 && je->start.version != JOURNAL_VERSION_V2)
|
else if (je->start.size != JE_START_V0_SIZE && je->start.version != JOURNAL_VERSION_V1 && je->start.version != JOURNAL_VERSION_V2)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "offset %08lx: journal superblock contains version %lu, but I only understand 0, 1 and 2\n",
|
fprintf(stderr, "offset %08jx: journal superblock contains version %ju, but I only understand 0, 1 and 2\n",
|
||||||
journal_pos, je->start.size == JE_START_V0_SIZE ? 0 : je->start.version);
|
journal_pos, je->start.size == JE_START_V0_SIZE ? 0 : je->start.version);
|
||||||
r = 1;
|
r = 1;
|
||||||
}
|
}
|
||||||
|
@ -296,7 +296,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||||
if (je->type == JE_START)
|
if (je->type == JE_START)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"start\",\"start\":\"0x%lx\"" : "je_start start=%08lx",
|
json ? ",\"type\":\"start\",\"start\":\"0x%jx\"" : "je_start start=%08jx",
|
||||||
je->start.journal_start
|
je->start.journal_start
|
||||||
);
|
);
|
||||||
if (je->start.data_csum_type)
|
if (je->start.data_csum_type)
|
||||||
|
@ -312,15 +312,15 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||||
{
|
{
|
||||||
auto & sw = je->small_write;
|
auto & sw = je->small_write;
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
|
json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%jx\""
|
||||||
: "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
|
: "je_small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u loc=%08jx",
|
||||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||||
sw.oid.inode, sw.oid.stripe, sw.version, sw.offset, sw.len, sw.data_offset
|
sw.oid.inode, sw.oid.stripe, sw.version, sw.offset, sw.len, sw.data_offset
|
||||||
);
|
);
|
||||||
if (journal_calc_data_pos != sw.data_offset)
|
if (journal_calc_data_pos != sw.data_offset)
|
||||||
{
|
{
|
||||||
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
|
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%jx\""
|
||||||
: " (mismatched, calculated = %08lx)", journal_pos);
|
: " (mismatched, calculated = %08jx)", journal_pos);
|
||||||
}
|
}
|
||||||
uint32_t data_csum_size = (!je_start.csum_block_size
|
uint32_t data_csum_size = (!je_start.csum_block_size
|
||||||
? 0
|
? 0
|
||||||
|
@ -367,8 +367,8 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||||
{
|
{
|
||||||
auto & bw = je->big_write;
|
auto & bw = je->big_write;
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
|
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%jx\""
|
||||||
: "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
|
: "je_big_write%s oid=%jx:%jx ver=%ju offset=%u len=%u loc=%08jx",
|
||||||
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
||||||
bw.oid.inode, bw.oid.stripe, bw.version, bw.offset, bw.len, bw.location
|
bw.oid.inode, bw.oid.stripe, bw.version, bw.offset, bw.len, bw.location
|
||||||
);
|
);
|
||||||
|
@ -398,24 +398,24 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||||
else if (je->type == JE_STABLE)
|
else if (je->type == JE_STABLE)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"stable\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
|
json ? ",\"type\":\"stable\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
|
||||||
: "je_stable oid=%lx:%lx ver=%lu\n",
|
: "je_stable oid=%jx:%jx ver=%ju\n",
|
||||||
je->stable.oid.inode, je->stable.oid.stripe, je->stable.version
|
je->stable.oid.inode, je->stable.oid.stripe, je->stable.version
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else if (je->type == JE_ROLLBACK)
|
else if (je->type == JE_ROLLBACK)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"rollback\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
|
json ? ",\"type\":\"rollback\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
|
||||||
: "je_rollback oid=%lx:%lx ver=%lu\n",
|
: "je_rollback oid=%jx:%jx ver=%ju\n",
|
||||||
je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version
|
je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else if (je->type == JE_DELETE)
|
else if (je->type == JE_DELETE)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
json ? ",\"type\":\"delete\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
|
json ? ",\"type\":\"delete\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
|
||||||
: "je_delete oid=%lx:%lx ver=%lu\n",
|
: "je_delete oid=%jx:%jx ver=%ju\n",
|
||||||
je->del.oid.inode, je->del.oid.stripe, je->del.version
|
je->del.oid.inode, je->del.oid.stripe, je->del.version
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,7 +54,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Unsupported version
|
// Unsupported version
|
||||||
fprintf(stderr, "Metadata format is too new for me (stored version is %lu, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
|
fprintf(stderr, "Metadata format is too new for me (stored version is %ju, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
|
||||||
free(data);
|
free(data);
|
||||||
close(dsk.meta_fd);
|
close(dsk.meta_fd);
|
||||||
dsk.meta_fd = -1;
|
dsk.meta_fd = -1;
|
||||||
|
@ -108,7 +108,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||||
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + dsk.clean_entry_size - 4);
|
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + dsk.clean_entry_size - 4);
|
||||||
if (*entry_csum != crc32c(0, entry, dsk.clean_entry_size - 4))
|
if (*entry_csum != crc32c(0, entry, dsk.clean_entry_size - 4))
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Metadata entry %lu is corrupt (checksum mismatch), skipping\n", block_num);
|
fprintf(stderr, "Metadata entry %ju is corrupt (checksum mismatch), skipping\n", block_num);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -184,7 +184,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size);
|
printf("{\"version\":\"0.5\",\"meta_block_size\":%ju,\"entries\":[\n", dsk.meta_block_size);
|
||||||
}
|
}
|
||||||
first_entry = true;
|
first_entry = true;
|
||||||
}
|
}
|
||||||
|
@ -192,7 +192,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
|
||||||
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"version\":%lu"
|
#define ENTRY_FMT "{\"block\":%ju,\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"version\":%ju"
|
||||||
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
||||||
#undef ENTRY_FMT
|
#undef ENTRY_FMT
|
||||||
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
|
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
|
||||||
|
@ -265,7 +265,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
|
||||||
{
|
{
|
||||||
free(new_meta_buf);
|
free(new_meta_buf);
|
||||||
new_meta_buf = NULL;
|
new_meta_buf = NULL;
|
||||||
fprintf(stderr, "Metadata (data block %lu) doesn't fit into the new area\n", data_block);
|
fprintf(stderr, "Metadata (data block %ju) doesn't fit into the new area\n", data_block);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf +
|
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf +
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
|
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
|
||||||
{
|
{
|
||||||
static const char *allow_additional_params[] = {
|
static const char *allow_additional_params[] = {
|
||||||
|
"autosync_writes",
|
||||||
"data_io",
|
"data_io",
|
||||||
"meta_io",
|
"meta_io",
|
||||||
"journal_io",
|
"journal_io",
|
||||||
|
@ -99,12 +100,9 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
options["disable_journal_fsync"] = options["disable_data_fsync"];
|
options["disable_journal_fsync"] = options["disable_data_fsync"];
|
||||||
}
|
}
|
||||||
// Calculate offsets if the same device is used for two or more of data, meta, and journal
|
// Calculate offsets if the same device is used for two or more of data, meta, and journal
|
||||||
if (options["journal_size"] == "")
|
if (options["journal_size"] == "" && (options["journal_device"] == "" || options["journal_device"] == options["data_device"]))
|
||||||
{
|
{
|
||||||
if (options["journal_device"] == "")
|
options["journal_size"] = is_hdd || !json_is_true(options["disable_data_fsync"]) ? "128M" : "32M";
|
||||||
options["journal_size"] = is_hdd ? "128M" : "32M";
|
|
||||||
else if (is_hdd)
|
|
||||||
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
|
|
||||||
}
|
}
|
||||||
bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
|
bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
|
||||||
if (is_hdd)
|
if (is_hdd)
|
||||||
|
@ -114,6 +112,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
if (is_hybrid && options["throttle_small_writes"] == "")
|
if (is_hybrid && options["throttle_small_writes"] == "")
|
||||||
options["throttle_small_writes"] = "1";
|
options["throttle_small_writes"] = "1";
|
||||||
}
|
}
|
||||||
|
else if (!json_is_true(options["disable_data_fsync"]))
|
||||||
|
{
|
||||||
|
if (options.find("min_flusher_count") == options.end())
|
||||||
|
options["min_flusher_count"] = "32";
|
||||||
|
if (options.find("max_flusher_count") == options.end())
|
||||||
|
options["max_flusher_count"] = "256";
|
||||||
|
if (options.find("autosync_writes") == options.end())
|
||||||
|
options["autosync_writes"] = "512";
|
||||||
|
}
|
||||||
json11::Json::object sb;
|
json11::Json::object sb;
|
||||||
blockstore_disk_t dsk;
|
blockstore_disk_t dsk;
|
||||||
try
|
try
|
||||||
|
@ -203,10 +210,10 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
desc += " with metadata on "+realpath_str(options["meta_device"]);
|
desc += " with metadata on "+realpath_str(options["meta_device"]);
|
||||||
if (sep_j)
|
if (sep_j)
|
||||||
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
|
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
|
||||||
fprintf(stderr, "Initialized OSD %lu on %s\n", osd_num, desc.c_str());
|
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
|
||||||
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
|
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%lu\n", osd_num);
|
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -330,7 +337,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
|
||||||
std::string out;
|
std::string out;
|
||||||
if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
|
if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to add %lu partition(s) with sfdisk\n", sizes.size());
|
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk\n", sizes.size());
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
// Get new partition table and find created partitions
|
// Get new partition table and find created partitions
|
||||||
|
@ -345,7 +352,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
|
||||||
}
|
}
|
||||||
if (new_parts.size() != sizes.size())
|
if (new_parts.size() != sizes.size())
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to add %lu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
|
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
// Check if new nodes exist and run partprobe if not
|
// Check if new nodes exist and run partprobe if not
|
||||||
|
@ -449,7 +456,7 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
|
||||||
bool is_journal = sb["params"]["journal_device"].string_value() == part_path;
|
bool is_journal = sb["params"]["journal_device"].string_value() == part_path;
|
||||||
bool is_data = sb["params"]["data_device"].string_value() == part_path;
|
bool is_data = sb["params"]["data_device"].string_value() == part_path;
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "%s is already initialized for OSD %lu%s, skipping\n",
|
stderr, "%s is already initialized for OSD %ju%s, skipping\n",
|
||||||
part["node"].string_value().c_str(), sb["params"]["osd_num"].uint64_value(),
|
part["node"].string_value().c_str(), sb["params"]["osd_num"].uint64_value(),
|
||||||
(is_data ? " data" : (is_meta ? " meta" : (is_journal ? " journal" : "")))
|
(is_data ? " data" : (is_meta ? " meta" : (is_journal ? " journal" : "")))
|
||||||
);
|
);
|
||||||
|
@ -532,7 +539,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
|
||||||
if (sel < 0)
|
if (sel < 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Could not find free space for new SSD journal and metadata (need %lu + %lu MiB)\n",
|
stderr, "Could not find free space for new SSD journal and metadata (need %ju + %ju MiB)\n",
|
||||||
meta_size/1024/1024, journal_size/1024/1024
|
meta_size/1024/1024, journal_size/1024/1024
|
||||||
);
|
);
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -616,6 +623,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||||
options.erase("disable_meta_fsync");
|
options.erase("disable_meta_fsync");
|
||||||
options.erase("disable_journal_fsync");
|
options.erase("disable_journal_fsync");
|
||||||
}
|
}
|
||||||
|
auto journal_size = options["journal_size"];
|
||||||
for (auto & dev: devinfo)
|
for (auto & dev: devinfo)
|
||||||
{
|
{
|
||||||
if (!hybrid || dev.is_hdd)
|
if (!hybrid || dev.is_hdd)
|
||||||
|
@ -633,11 +641,13 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
options.erase("journal_size");
|
||||||
}
|
}
|
||||||
// Treat all disks as SSDs if not in the hybrid mode
|
// Treat all disks as SSDs if not in the hybrid mode
|
||||||
prepare_one(options, dev.is_hdd ? 1 : 0);
|
prepare_one(options, dev.is_hdd ? 1 : 0);
|
||||||
if (hybrid)
|
if (hybrid)
|
||||||
{
|
{
|
||||||
|
options["journal_size"] = journal_size;
|
||||||
options.erase("journal_device");
|
options.erase("journal_device");
|
||||||
options.erase("meta_device");
|
options.erase("meta_device");
|
||||||
}
|
}
|
||||||
|
|
|
@ -184,7 +184,7 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
|
||||||
}
|
}
|
||||||
if (new_meta_len < dsk.meta_block_size*new_meta_blocks)
|
if (new_meta_len < dsk.meta_block_size*new_meta_blocks)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "New metadata area size is too small, should be at least %lu bytes\n", dsk.meta_block_size*new_meta_blocks);
|
fprintf(stderr, "New metadata area size is too small, should be at least %ju bytes\n", dsk.meta_block_size*new_meta_blocks);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
// Check that new metadata, journal and data areas don't overlap
|
// Check that new metadata, journal and data areas don't overlap
|
||||||
|
@ -289,7 +289,7 @@ int disk_tool_t::resize_copy_data()
|
||||||
if (data->res != dsk.data_block_size)
|
if (data->res != dsk.data_block_size)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to read %u bytes at %lu from %s: %s\n", dsk.data_block_size,
|
stderr, "Failed to read %u bytes at %ju from %s: %s\n", dsk.data_block_size,
|
||||||
dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size, dsk.data_device.c_str(),
|
dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size, dsk.data_device.c_str(),
|
||||||
data->res < 0 ? strerror(-data->res) : "short read"
|
data->res < 0 ? strerror(-data->res) : "short read"
|
||||||
);
|
);
|
||||||
|
@ -314,7 +314,7 @@ int disk_tool_t::resize_copy_data()
|
||||||
if (data->res != dsk.data_block_size)
|
if (data->res != dsk.data_block_size)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to write %u bytes at %lu to %s: %s\n", dsk.data_block_size,
|
stderr, "Failed to write %u bytes at %ju to %s: %s\n", dsk.data_block_size,
|
||||||
dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size, dsk.data_device.c_str(),
|
dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size, dsk.data_device.c_str(),
|
||||||
data->res < 0 ? strerror(-data->res) : "short write"
|
data->res < 0 ? strerror(-data->res) : "short write"
|
||||||
);
|
);
|
||||||
|
|
|
@ -43,8 +43,8 @@ int disk_tool_t::udev_import(std::string device)
|
||||||
}
|
}
|
||||||
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
|
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
|
||||||
// Print variables for udev
|
// Print variables for udev
|
||||||
printf("VITASTOR_OSD_NUM=%lu\n", osd_num);
|
printf("VITASTOR_OSD_NUM=%ju\n", osd_num);
|
||||||
printf("VITASTOR_ALIAS=osd%lu-%s\n", osd_num, sb["device_type"].string_value().c_str());
|
printf("VITASTOR_ALIAS=osd%ju-%s\n", osd_num, sb["device_type"].string_value().c_str());
|
||||||
printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(sb["params"]["data_device"].string_value()).c_str());
|
printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(sb["params"]["data_device"].string_value()).c_str());
|
||||||
if (sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"])
|
if (sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"])
|
||||||
printf("VITASTOR_META_DEVICE=%s\n", udev_escape(sb["params"]["meta_device"].string_value()).c_str());
|
printf("VITASTOR_META_DEVICE=%s\n", udev_escape(sb["params"]["meta_device"].string_value()).c_str());
|
||||||
|
@ -466,12 +466,12 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
||||||
close(fd);
|
close(fd);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to clear OSD %lu %s device %s superblock: %s\n",
|
fprintf(stderr, "Failed to clear OSD %ju %s device %s superblock: %s\n",
|
||||||
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str(), strerror(errno));
|
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str(), strerror(errno));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "OSD %lu %s device %s superblock cleared\n",
|
fprintf(stderr, "OSD %ju %s device %s superblock cleared\n",
|
||||||
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str());
|
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str());
|
||||||
}
|
}
|
||||||
if (sb["params"][dev_type+"_device"].string_value().substr(0, 22) == "/dev/disk/by-partuuid/")
|
if (sb["params"][dev_type+"_device"].string_value().substr(0, 22) == "/dev/disk/by-partuuid/")
|
||||||
|
|
|
@ -12,9 +12,9 @@ uint64_t sscanf_json(const char *fmt, const json11::Json & str)
|
||||||
{
|
{
|
||||||
uint64_t value = 0;
|
uint64_t value = 0;
|
||||||
if (fmt)
|
if (fmt)
|
||||||
sscanf(str.string_value().c_str(), "%lx", &value);
|
sscanf(str.string_value().c_str(), "%jx", &value);
|
||||||
else if (str.string_value().size() > 2 && (str.string_value()[0] == '0' && str.string_value()[1] == 'x'))
|
else if (str.string_value().size() > 2 && (str.string_value()[0] == '0' && str.string_value()[1] == 'x'))
|
||||||
sscanf(str.string_value().c_str(), "0x%lx", &value);
|
sscanf(str.string_value().c_str(), "0x%jx", &value);
|
||||||
else
|
else
|
||||||
value = str.uint64_value();
|
value = str.uint64_value();
|
||||||
return value;
|
return value;
|
||||||
|
|
|
@ -333,7 +333,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||||
etcd_watch_ws = NULL;
|
etcd_watch_ws = NULL;
|
||||||
}
|
}
|
||||||
if (this->log_level > 1)
|
if (this->log_level > 1)
|
||||||
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %lu\n", etcd_address.c_str(), etcd_watch_revision);
|
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju\n", etcd_address.c_str(), etcd_watch_revision);
|
||||||
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
|
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
|
||||||
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
|
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
|
||||||
{
|
{
|
||||||
|
@ -357,7 +357,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||||
etcd_watches_initialised++;
|
etcd_watches_initialised++;
|
||||||
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && this->log_level > 0)
|
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && this->log_level > 0)
|
||||||
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %lu\n", cur_addr.c_str(), etcd_watch_revision);
|
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju\n", cur_addr.c_str(), etcd_watch_revision);
|
||||||
}
|
}
|
||||||
if (data["result"]["canceled"].bool_value())
|
if (data["result"]["canceled"].bool_value())
|
||||||
{
|
{
|
||||||
|
@ -371,7 +371,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||||
// check to not trigger on_reload_hook multiple times
|
// check to not trigger on_reload_hook multiple times
|
||||||
if (etcd_watch_ws != NULL)
|
if (etcd_watch_ws != NULL)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
fprintf(stderr, "Revisions before %ju were compacted by etcd, reloading state\n",
|
||||||
data["result"]["compact_revision"].uint64_value());
|
data["result"]["compact_revision"].uint64_value());
|
||||||
http_close(etcd_watch_ws);
|
http_close(etcd_watch_ws);
|
||||||
etcd_watch_ws = NULL;
|
etcd_watch_ws = NULL;
|
||||||
|
@ -382,7 +382,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, exiting\n",
|
fprintf(stderr, "Revisions before %ju were compacted by etcd, exiting\n",
|
||||||
data["result"]["compact_revision"].uint64_value());
|
data["result"]["compact_revision"].uint64_value());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
@ -646,7 +646,7 @@ void etcd_state_client_t::load_pgs()
|
||||||
etcd_watch_revision = data["header"]["revision"].uint64_value()+1;
|
etcd_watch_revision = data["header"]["revision"].uint64_value()+1;
|
||||||
if (this->log_level > 3)
|
if (this->log_level > 3)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Loaded revision %lu of PG configuration\n", etcd_watch_revision-1);
|
fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto & res: data["responses"].array_items())
|
for (auto & res: data["responses"].array_items())
|
||||||
|
@ -740,7 +740,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
|
||||||
{
|
{
|
||||||
if (seen_peers.find(peer_item.first) == seen_peers.end())
|
if (seen_peers.find(peer_item.first) == seen_peers.end())
|
||||||
{
|
{
|
||||||
fprintf(stderr, "OSD %lu state disappeared after reload, forgetting it\n", peer_item.first);
|
fprintf(stderr, "OSD %ju state disappeared after reload, forgetting it\n", peer_item.first);
|
||||||
parse_state((etcd_kv_t){
|
parse_state((etcd_kv_t){
|
||||||
.key = etcd_prefix+"/osd/state/"+std::to_string(peer_item.first),
|
.key = etcd_prefix+"/osd/state/"+std::to_string(peer_item.first),
|
||||||
});
|
});
|
||||||
|
@ -890,7 +890,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
{
|
{
|
||||||
if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
|
if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %zu != pool pg_size %ju\n",
|
||||||
pool_id, pg_item.first, pg_item.second.target_set.size(), parsed_cfg.pg_size);
|
pool_id, pg_item.first, pg_item.second.target_set.size(), parsed_cfg.pg_size);
|
||||||
pg_item.second.pause = true;
|
pg_item.second.pause = true;
|
||||||
}
|
}
|
||||||
|
@ -936,7 +936,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
}
|
}
|
||||||
if (parsed_cfg.target_set.size() != pool_config[pool_id].pg_size)
|
if (parsed_cfg.target_set.size() != pool_config[pool_id].pg_size)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %zu != pool pg_size %ju\n",
|
||||||
pool_id, pg_num, parsed_cfg.target_set.size(), pool_config[pool_id].pg_size);
|
pool_id, pg_num, parsed_cfg.target_set.size(), pool_config[pool_id].pg_size);
|
||||||
parsed_cfg.pause = true;
|
parsed_cfg.pause = true;
|
||||||
}
|
}
|
||||||
|
@ -950,7 +950,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
if (pg_it->second.config_exists && pg_it->first != ++n)
|
if (pg_it->second.config_exists && pg_it->first != ++n)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Invalid pool %u PG configuration: PG numbers don't cover whole 1..%lu range\n",
|
stderr, "Invalid pool %u PG configuration: PG numbers don't cover whole 1..%zu range\n",
|
||||||
pool_item.second.id, pool_item.second.pg_config.size()
|
pool_item.second.id, pool_item.second.pg_config.size()
|
||||||
);
|
);
|
||||||
for (pg_it = pool_item.second.pg_config.begin(); pg_it != pool_item.second.pg_config.end(); pg_it++)
|
for (pg_it = pool_item.second.pg_config.begin(); pg_it != pool_item.second.pg_config.end(); pg_it++)
|
||||||
|
@ -1066,7 +1066,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
(state & PG_PEERING) && state != PG_PEERING ||
|
(state & PG_PEERING) && state != PG_PEERING ||
|
||||||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
|
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%lu, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
|
fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%ju, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pg_cfg.cur_primary = cur_primary;
|
pg_cfg.cur_primary = cur_primary;
|
||||||
|
@ -1102,7 +1102,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
uint64_t pool_id = 0;
|
uint64_t pool_id = 0;
|
||||||
uint64_t inode_num = 0;
|
uint64_t inode_num = 0;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
|
int scanned = sscanf(key.c_str() + etcd_prefix.length()+14, "%ju/%ju%c", &pool_id, &inode_num, &null_byte);
|
||||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)))
|
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)))
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
|
@ -1145,7 +1145,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
else if (parent_pool_id >= POOL_ID_MAX)
|
else if (parent_pool_id >= POOL_ID_MAX)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
|
stderr, "Inode %ju/%ju parent_pool value is invalid, ignoring parent setting\n",
|
||||||
inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
|
inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
|
||||||
);
|
);
|
||||||
parent_inode_num = 0;
|
parent_inode_num = 0;
|
||||||
|
|
|
@ -377,7 +377,7 @@ static void io_callback(void *opaque, long retval)
|
||||||
bsd->completed.push_back(io);
|
bsd->completed.push_back(io);
|
||||||
if (bsd->trace)
|
if (bsd->trace)
|
||||||
{
|
{
|
||||||
printf("--- %s 0x%lx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
|
printf("--- %s 0x%jx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
|
||||||
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
|
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -405,10 +405,11 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||||
bsd->inflight++;
|
bsd->inflight++;
|
||||||
|
|
||||||
uint64_t inode = opt->image ? vitastor_c_inode_get_num(bsd->watch) : opt->inode;
|
uint64_t inode = opt->image ? vitastor_c_inode_get_num(bsd->watch) : opt->inode;
|
||||||
|
assert(io->xfer_buflen < (size_t)-1);
|
||||||
switch (io->ddir)
|
switch (io->ddir)
|
||||||
{
|
{
|
||||||
case DDIR_READ:
|
case DDIR_READ:
|
||||||
iov = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
iov = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
|
||||||
vitastor_c_read(bsd->cli, inode, io->offset, io->xfer_buflen, &iov, 1, read_callback, io);
|
vitastor_c_read(bsd->cli, inode, io->offset, io->xfer_buflen, &iov, 1, read_callback, io);
|
||||||
bsd->last_sync = false;
|
bsd->last_sync = false;
|
||||||
break;
|
break;
|
||||||
|
@ -436,7 +437,7 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||||
io->error = EROFS;
|
io->error = EROFS;
|
||||||
return FIO_Q_COMPLETED;
|
return FIO_Q_COMPLETED;
|
||||||
}
|
}
|
||||||
iov = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
iov = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
|
||||||
vitastor_c_write(bsd->cli, inode, io->offset, io->xfer_buflen, 0, &iov, 1, io_callback, io);
|
vitastor_c_write(bsd->cli, inode, io->offset, io->xfer_buflen, 0, &iov, 1, io_callback, io);
|
||||||
bsd->last_sync = false;
|
bsd->last_sync = false;
|
||||||
break;
|
break;
|
||||||
|
@ -453,11 +454,11 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||||
{
|
{
|
||||||
if (io->ddir == DDIR_SYNC)
|
if (io->ddir == DDIR_SYNC)
|
||||||
{
|
{
|
||||||
printf("+++ SYNC 0x%lx\n", (uint64_t)io);
|
printf("+++ SYNC 0x%jx\n", (uint64_t)io);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("+++ %s 0x%lx 0x%llx+%lx\n",
|
printf("+++ %s 0x%jx 0x%llx+%jx\n",
|
||||||
io->ddir == DDIR_READ ? "READ" : "WRITE",
|
io->ddir == DDIR_READ ? "READ" : "WRITE",
|
||||||
(uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
|
(uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
|
||||||
}
|
}
|
||||||
|
|
|
@ -310,7 +310,8 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||||
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
|
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
|
||||||
if (io->ddir == DDIR_WRITE)
|
if (io->ddir == DDIR_WRITE)
|
||||||
{
|
{
|
||||||
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
assert(io->xfer_buflen <= 0x7fffffff);
|
||||||
|
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
|
||||||
wtotal += io->xfer_buflen;
|
wtotal += io->xfer_buflen;
|
||||||
}
|
}
|
||||||
if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
|
if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
|
||||||
|
@ -341,13 +342,13 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||||
read_blocking(bsd->connect_fd, reply.buf, OSD_PACKET_SIZE);
|
read_blocking(bsd->connect_fd, reply.buf, OSD_PACKET_SIZE);
|
||||||
if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC)
|
if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "bad reply: magic = %lx instead of %lx\n", reply.hdr.magic, SECONDARY_OSD_REPLY_MAGIC);
|
fprintf(stderr, "bad reply: magic = %jx instead of %jx\n", reply.hdr.magic, SECONDARY_OSD_REPLY_MAGIC);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
auto it = bsd->queue.find(reply.hdr.id);
|
auto it = bsd->queue.find(reply.hdr.id);
|
||||||
if (it == bsd->queue.end())
|
if (it == bsd->queue.end())
|
||||||
{
|
{
|
||||||
fprintf(stderr, "bad reply: op id %lx missing in local queue\n", reply.hdr.id);
|
fprintf(stderr, "bad reply: op id %jx missing in local queue\n", reply.hdr.id);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
io_u* io = it->second->fio_op;
|
io_u* io = it->second->fio_op;
|
||||||
|
@ -357,7 +358,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||||
{
|
{
|
||||||
if (reply.hdr.retval != io->xfer_buflen)
|
if (reply.hdr.retval != io->xfer_buflen)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
fprintf(stderr, "Short read: retval = %jd instead of %ju\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
// Support bitmap
|
// Support bitmap
|
||||||
|
@ -371,7 +372,8 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||||
else
|
else
|
||||||
iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
|
iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
|
||||||
}
|
}
|
||||||
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
assert(io->xfer_buflen <= 0x7FFFFFFF);
|
||||||
|
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
|
||||||
readv_blocking(bsd->connect_fd, iov, iovcnt);
|
readv_blocking(bsd->connect_fd, iov, iovcnt);
|
||||||
if (reply.sec_rw.attr_len > 8)
|
if (reply.sec_rw.attr_len > 8)
|
||||||
{
|
{
|
||||||
|
@ -382,7 +384,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||||
{
|
{
|
||||||
if (reply.hdr.retval != io->xfer_buflen)
|
if (reply.hdr.retval != io->xfer_buflen)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
fprintf(stderr, "Short write: retval = %jd instead of %ju\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -390,13 +392,13 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||||
{
|
{
|
||||||
if (reply.hdr.retval != 0)
|
if (reply.hdr.retval != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Sync failed: retval = %ld\n", reply.hdr.retval);
|
fprintf(stderr, "Sync failed: retval = %jd\n", reply.hdr.retval);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (opt->trace)
|
if (opt->trace)
|
||||||
{
|
{
|
||||||
printf("--- %s # %ld\n", io->ddir == DDIR_READ ? "READ" :
|
printf("--- %s # %ju\n", io->ddir == DDIR_READ ? "READ" :
|
||||||
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), reply.hdr.id);
|
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), reply.hdr.id);
|
||||||
}
|
}
|
||||||
bsd->completed.push_back(io);
|
bsd->completed.push_back(io);
|
||||||
|
|
|
@ -11,7 +11,7 @@ inline void* memalign_or_die(size_t alignment, size_t size)
|
||||||
void *buf = memalign(alignment, size);
|
void *buf = memalign(alignment, size);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", size);
|
printf("Failed to allocate %zu bytes\n", size);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
|
@ -22,7 +22,7 @@ inline void* malloc_or_die(size_t size)
|
||||||
void *buf = malloc(size);
|
void *buf = malloc(size);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", size);
|
printf("Failed to allocate %zu bytes\n", size);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
|
@ -33,7 +33,7 @@ inline void* realloc_or_die(void *ptr, size_t size)
|
||||||
void *buf = realloc(ptr, size);
|
void *buf = realloc(ptr, size);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", size);
|
printf("Failed to allocate %zu bytes\n", size);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
|
@ -44,7 +44,7 @@ inline void* calloc_or_die(size_t nmemb, size_t size)
|
||||||
void *buf = calloc(nmemb, size);
|
void *buf = calloc(nmemb, size);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", size * nmemb);
|
printf("Failed to allocate %zu bytes\n", size * nmemb);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
|
|
|
@ -27,13 +27,13 @@ void osd_messenger_t::init()
|
||||||
if (!rdma_context)
|
if (!rdma_context)
|
||||||
{
|
{
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
fprintf(stderr, "[OSD %lu] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
|
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
|
||||||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||||
fprintf(stderr, "[OSD %lu] RDMA initialized successfully\n", osd_num);
|
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
||||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||||
{
|
{
|
||||||
|
@ -45,11 +45,12 @@ void osd_messenger_t::init()
|
||||||
#endif
|
#endif
|
||||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||||
{
|
{
|
||||||
std::vector<int> to_stop;
|
auto cl_it = clients.begin();
|
||||||
std::vector<osd_op_t*> to_ping;
|
while (cl_it != clients.end())
|
||||||
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
|
||||||
{
|
{
|
||||||
auto cl = cl_it->second;
|
auto cl = cl_it->second;
|
||||||
|
cl_it++;
|
||||||
|
auto peer_fd = cl->peer_fd;
|
||||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
|
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
|
||||||
{
|
{
|
||||||
// Do not run keepalive on regular clients
|
// Do not run keepalive on regular clients
|
||||||
|
@ -61,8 +62,10 @@ void osd_messenger_t::init()
|
||||||
if (!cl->ping_time_remaining)
|
if (!cl->ping_time_remaining)
|
||||||
{
|
{
|
||||||
// Ping timed out, stop the client
|
// Ping timed out, stop the client
|
||||||
fprintf(stderr, "Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||||
to_stop.push_back(cl->peer_fd);
|
stop_client(peer_fd, true);
|
||||||
|
// Restart iterator because it may be invalidated
|
||||||
|
cl_it = clients.upper_bound(peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (cl->idle_time_remaining > 0)
|
else if (cl->idle_time_remaining > 0)
|
||||||
|
@ -96,13 +99,15 @@ void osd_messenger_t::init()
|
||||||
delete op;
|
delete op;
|
||||||
if (fail_fd >= 0)
|
if (fail_fd >= 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
|
fprintf(stderr, "Ping failed for OSD %ju (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
|
||||||
stop_client(fail_fd, true);
|
stop_client(fail_fd, true);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
to_ping.push_back(op);
|
|
||||||
cl->ping_time_remaining = osd_ping_timeout;
|
cl->ping_time_remaining = osd_ping_timeout;
|
||||||
cl->idle_time_remaining = osd_idle_timeout;
|
cl->idle_time_remaining = osd_idle_timeout;
|
||||||
|
outbox_push(op);
|
||||||
|
// Restart iterator because it may be invalidated
|
||||||
|
cl_it = clients.upper_bound(peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -110,15 +115,6 @@ void osd_messenger_t::init()
|
||||||
cl->idle_time_remaining = osd_idle_timeout;
|
cl->idle_time_remaining = osd_idle_timeout;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Don't stop clients while a 'clients' iterator is still active
|
|
||||||
for (int peer_fd: to_stop)
|
|
||||||
{
|
|
||||||
stop_client(peer_fd, true);
|
|
||||||
}
|
|
||||||
for (auto op: to_ping)
|
|
||||||
{
|
|
||||||
outbox_push(op);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -257,7 +253,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||||
clients[peer_fd] = new osd_client_t();
|
clients[peer_fd] = new osd_client_t();
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
|
||||||
}
|
}
|
||||||
clients[peer_fd]->peer_addr = addr;
|
clients[peer_fd]->peer_addr = addr;
|
||||||
clients[peer_fd]->peer_port = peer_port;
|
clients[peer_fd]->peer_port = peer_port;
|
||||||
|
@ -323,7 +319,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
||||||
// Stop client
|
// Stop client
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
fprintf(stderr, "[OSD %ju] client %d disconnected\n", this->osd_num, peer_fd);
|
||||||
}
|
}
|
||||||
stop_client(peer_fd, true);
|
stop_client(peer_fd, true);
|
||||||
}
|
}
|
||||||
|
@ -349,7 +345,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
|
||||||
wp.connecting = false;
|
wp.connecting = false;
|
||||||
if (peer_fd < 0)
|
if (peer_fd < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
|
fprintf(stderr, "Failed to connect to peer OSD %ju address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
|
||||||
if (wp.address_changed)
|
if (wp.address_changed)
|
||||||
{
|
{
|
||||||
wp.address_changed = false;
|
wp.address_changed = false;
|
||||||
|
@ -376,7 +372,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
|
||||||
}
|
}
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "[OSD %lu] Connected with peer OSD %lu (client %d)\n", osd_num, peer_osd, peer_fd);
|
fprintf(stderr, "[OSD %ju] Connected with peer OSD %ju (client %d)\n", osd_num, peer_osd, peer_fd);
|
||||||
}
|
}
|
||||||
wanted_peers.erase(peer_osd);
|
wanted_peers.erase(peer_osd);
|
||||||
repeer_pgs(peer_osd);
|
repeer_pgs(peer_osd);
|
||||||
|
@ -422,7 +418,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
fprintf(stderr, "Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
|
fprintf(stderr, "Failed to get config from OSD %ju (retval=%jd), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -430,18 +426,18 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
fprintf(stderr, "Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
|
fprintf(stderr, "Failed to get config from OSD %ju: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
|
||||||
}
|
}
|
||||||
else if (config["osd_num"].uint64_value() != cl->osd_num)
|
else if (config["osd_num"].uint64_value() != cl->osd_num)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
fprintf(stderr, "Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
|
fprintf(stderr, "Connected to OSD %ju instead of OSD %ju, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
|
||||||
}
|
}
|
||||||
else if (config["protocol_version"].uint64_value() != OSD_PROTOCOL_VERSION)
|
else if (config["protocol_version"].uint64_value() != OSD_PROTOCOL_VERSION)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "OSD %lu protocol version is %lu, but only version %u is supported.\n"
|
stderr, "OSD %ju protocol version is %ju, but only version %u is supported.\n"
|
||||||
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n",
|
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n",
|
||||||
cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
|
cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
|
||||||
);
|
);
|
||||||
|
@ -467,7 +463,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
cl->rdma_conn->connect(&addr) != 0)
|
cl->rdma_conn->connect(&addr) != 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
|
stderr, "Failed to connect to OSD %ju (address %s) using RDMA\n",
|
||||||
cl->osd_num, config["rdma_address"].string_value().c_str()
|
cl->osd_num, config["rdma_address"].string_value().c_str()
|
||||||
);
|
);
|
||||||
delete cl->rdma_conn;
|
delete cl->rdma_conn;
|
||||||
|
@ -488,7 +484,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
}
|
}
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
|
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
|
||||||
}
|
}
|
||||||
cl->peer_state = PEER_RDMA;
|
cl->peer_state = PEER_RDMA;
|
||||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||||
|
@ -520,7 +516,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||||
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
||||||
{
|
{
|
||||||
assert(peer_fd != 0);
|
assert(peer_fd != 0);
|
||||||
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
|
fprintf(stderr, "[OSD %ju] new client %d: connection from %s\n", this->osd_num, peer_fd,
|
||||||
addr_to_string(addr).c_str());
|
addr_to_string(addr).c_str());
|
||||||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
|
|
@ -76,7 +76,7 @@ struct osd_op_buf_list_t
|
||||||
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
memcpy(buf, inline_buf, sizeof(iovec) * old);
|
memcpy(buf, inline_buf, sizeof(iovec) * old);
|
||||||
|
@ -87,7 +87,7 @@ struct osd_op_buf_list_t
|
||||||
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -109,7 +109,7 @@ struct osd_op_buf_list_t
|
||||||
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
memcpy(buf, inline_buf, sizeof(iovec)*old);
|
memcpy(buf, inline_buf, sizeof(iovec)*old);
|
||||||
|
@ -120,7 +120,7 @@ struct osd_op_buf_list_t
|
||||||
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,7 @@ std::string msgr_rdma_address_t::to_string()
|
||||||
{
|
{
|
||||||
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
||||||
sprintf(
|
sprintf(
|
||||||
msg, "%04x:%06x:%06x:%016lx%016lx", lid, qpn, psn,
|
msg, "%04x:%06x:%06x:%016jx%016jx", lid, qpn, psn,
|
||||||
htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
|
htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
|
||||||
);
|
);
|
||||||
return std::string(msg);
|
return std::string(msg);
|
||||||
|
@ -20,7 +20,7 @@ bool msgr_rdma_address_t::from_string(const char *str, msgr_rdma_address_t *dest
|
||||||
{
|
{
|
||||||
uint64_t* gid = (uint64_t*)&dest->gid;
|
uint64_t* gid = (uint64_t*)&dest->gid;
|
||||||
int scanned = sscanf(
|
int scanned = sscanf(
|
||||||
str, "%hx:%x:%x:%16lx%16lx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
|
str, "%hx:%x:%x:%16jx%16jx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
|
||||||
);
|
);
|
||||||
gid[0] = be64toh(gid[0]);
|
gid[0] = be64toh(gid[0]);
|
||||||
gid[1] = be64toh(gid[1]);
|
gid[1] = be64toh(gid[1]);
|
||||||
|
@ -594,7 +594,7 @@ void osd_messenger_t::handle_rdma_events()
|
||||||
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||||
if (cl->osd_num)
|
if (cl->osd_num)
|
||||||
{
|
{
|
||||||
fprintf(stderr, " (OSD %lu)", cl->osd_num);
|
fprintf(stderr, " (OSD %ju)", cl->osd_num);
|
||||||
}
|
}
|
||||||
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
|
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
|
||||||
stop_client(client_id);
|
stop_client(client_id);
|
||||||
|
|
|
@ -180,7 +180,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||||
handle_op_hdr(cl);
|
handle_op_hdr(cl);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -297,7 +297,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||||
if (req_it == cl->sent_ops.end())
|
if (req_it == cl->sent_ops.end())
|
||||||
{
|
{
|
||||||
// Command out of sync. Drop connection
|
// Command out of sync. Drop connection
|
||||||
fprintf(stderr, "Client %d command out of sync: id %lu\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -312,7 +312,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||||
if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
|
if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
|
||||||
{
|
{
|
||||||
// Check reply length to not overflow the buffer
|
// Check reply length to not overflow the buffer
|
||||||
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %ld+%u\n",
|
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
|
||||||
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
||||||
cl->sent_ops[op->req.hdr.id] = op;
|
cl->sent_ops[op->req.hdr.id] = op;
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
|
|
|
@ -61,11 +61,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||||
{
|
{
|
||||||
if (cl->osd_num)
|
if (cl->osd_num)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
fprintf(stderr, "[OSD %ju] Stopping client %d (OSD peer %ju)\n", osd_num, peer_fd, cl->osd_num);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
fprintf(stderr, "[OSD %ju] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||||
|
|
|
@ -738,7 +738,7 @@ protected:
|
||||||
}
|
}
|
||||||
uint64_t handle = *((uint64_t*)cur_req.handle);
|
uint64_t handle = *((uint64_t*)cur_req.handle);
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("request %lx +%x %lx\n", be64toh(cur_req.from), be32toh(cur_req.len), handle);
|
printf("request %jx +%x %jx\n", be64toh(cur_req.from), be32toh(cur_req.len), handle);
|
||||||
#endif
|
#endif
|
||||||
void *buf = NULL;
|
void *buf = NULL;
|
||||||
cluster_op_t *op = new cluster_op_t;
|
cluster_op_t *op = new cluster_op_t;
|
||||||
|
@ -759,7 +759,7 @@ protected:
|
||||||
op->callback = [this, buf, handle](cluster_op_t *op)
|
op->callback = [this, buf, handle](cluster_op_t *op)
|
||||||
{
|
{
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("reply %lx e=%d\n", handle, op->retval);
|
printf("reply %jx e=%d\n", handle, op->retval);
|
||||||
#endif
|
#endif
|
||||||
nbd_reply *reply = (nbd_reply*)buf;
|
nbd_reply *reply = (nbd_reply*)buf;
|
||||||
reply->magic = htobe32(NBD_REPLY_MAGIC);
|
reply->magic = htobe32(NBD_REPLY_MAGIC);
|
||||||
|
@ -769,7 +769,7 @@ protected:
|
||||||
if (op->retval < 0 || op->opcode != OSD_OP_READ)
|
if (op->retval < 0 || op->opcode != OSD_OP_READ)
|
||||||
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) });
|
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) });
|
||||||
else
|
else
|
||||||
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) + op->len });
|
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) + (size_t)op->len });
|
||||||
to_free.push_back(buf);
|
to_free.push_back(buf);
|
||||||
delete op;
|
delete op;
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
|
|
|
@ -78,7 +78,7 @@ std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename)
|
||||||
{
|
{
|
||||||
// encode as: d <length> <hex dir_ino> / <filename>
|
// encode as: d <length> <hex dir_ino> / <filename>
|
||||||
char key[24] = { 0 };
|
char key[24] = { 0 };
|
||||||
snprintf(key, sizeof(key), "d-%lx/", dir_ino);
|
snprintf(key, sizeof(key), "d-%jx/", dir_ino);
|
||||||
int n = strnlen(key, sizeof(key)-1) - 3;
|
int n = strnlen(key, sizeof(key)-1) - 3;
|
||||||
if (n < 10)
|
if (n < 10)
|
||||||
key[1] = '0'+n;
|
key[1] = '0'+n;
|
||||||
|
@ -99,7 +99,7 @@ std::string kv_direntry_filename(const std::string & key)
|
||||||
std::string kv_inode_key(uint64_t ino)
|
std::string kv_inode_key(uint64_t ino)
|
||||||
{
|
{
|
||||||
char key[24] = { 0 };
|
char key[24] = { 0 };
|
||||||
snprintf(key, sizeof(key), "i-%lx", ino);
|
snprintf(key, sizeof(key), "i-%jx", ino);
|
||||||
int n = strnlen(key, sizeof(key)-1) - 2;
|
int n = strnlen(key, sizeof(key)-1) - 2;
|
||||||
if (n < 10)
|
if (n < 10)
|
||||||
key[1] = '0'+n;
|
key[1] = '0'+n;
|
||||||
|
|
|
@ -87,7 +87,7 @@ struct kv_create_state
|
||||||
static void kv_do_create(kv_create_state *st)
|
static void kv_do_create(kv_create_state *st)
|
||||||
{
|
{
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] CREATE %lu/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str());
|
fprintf(stderr, "[%d] CREATE %ju/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str());
|
||||||
if (st->filename == "" || st->filename.find("/") != std::string::npos)
|
if (st->filename == "" || st->filename.find("/") != std::string::npos)
|
||||||
{
|
{
|
||||||
auto cb = std::move(st->cb);
|
auto cb = std::move(st->cb);
|
||||||
|
@ -128,7 +128,7 @@ static void kv_do_create(kv_create_state *st)
|
||||||
res = -EEXIST;
|
res = -EEXIST;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
fprintf(stderr, "create %lu/%s failed: %s (code %d)\n", st->dir_ino, st->filename.c_str(), strerror(-res), res);
|
fprintf(stderr, "create %ju/%s failed: %s (code %d)\n", st->dir_ino, st->filename.c_str(), strerror(-res), res);
|
||||||
auto cb = std::move(st->cb);
|
auto cb = std::move(st->cb);
|
||||||
cb(res);
|
cb(res);
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,7 +48,7 @@ int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
||||||
std::string fh = args->object;
|
std::string fh = args->object;
|
||||||
auto ino = kv_fh_inode(fh);
|
auto ino = kv_fh_inode(fh);
|
||||||
if (self->parent->trace)
|
if (self->parent->trace)
|
||||||
fprintf(stderr, "[%d] GETATTR %lu\n", self->nfs_fd, ino);
|
fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino);
|
||||||
if (!kv_fh_valid(fh))
|
if (!kv_fh_valid(fh))
|
||||||
{
|
{
|
||||||
*reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
|
*reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
|
||||||
|
|
|
@ -134,7 +134,7 @@ resume_3:
|
||||||
resume_4:
|
resume_4:
|
||||||
if (st->res2 < 0)
|
if (st->res2 < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Warning: failed to delete new linked direntry %lu/%s: %s (code %d)\n",
|
fprintf(stderr, "Warning: failed to delete new linked direntry %ju/%s: %s (code %d)\n",
|
||||||
st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2);
|
st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -153,7 +153,7 @@ int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
||||||
st->dir_ino = kv_fh_inode(args->link.dir);
|
st->dir_ino = kv_fh_inode(args->link.dir);
|
||||||
st->filename = args->link.name;
|
st->filename = args->link.name;
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] LINK %lu -> %lu/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str());
|
fprintf(stderr, "[%d] LINK %ju -> %ju/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str());
|
||||||
if (!st->ino || !st->dir_ino || st->filename == "")
|
if (!st->ino || !st->dir_ino || st->filename == "")
|
||||||
{
|
{
|
||||||
LINK3res *reply = (LINK3res*)rop->reply;
|
LINK3res *reply = (LINK3res*)rop->reply;
|
||||||
|
|
|
@ -16,7 +16,7 @@ int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
||||||
inode_t dir_ino = kv_fh_inode(args->what.dir);
|
inode_t dir_ino = kv_fh_inode(args->what.dir);
|
||||||
std::string filename = args->what.name;
|
std::string filename = args->what.name;
|
||||||
if (self->parent->trace)
|
if (self->parent->trace)
|
||||||
fprintf(stderr, "[%d] LOOKUP %lu/%s\n", self->nfs_fd, dir_ino, filename.c_str());
|
fprintf(stderr, "[%d] LOOKUP %ju/%s\n", self->nfs_fd, dir_ino, filename.c_str());
|
||||||
if (!dir_ino || filename == "")
|
if (!dir_ino || filename == "")
|
||||||
{
|
{
|
||||||
*reply = (LOOKUP3res){ .status = NFS3ERR_INVAL };
|
*reply = (LOOKUP3res){ .status = NFS3ERR_INVAL };
|
||||||
|
@ -70,7 +70,7 @@ int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
|
||||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||||
READLINK3args *args = (READLINK3args*)rop->request;
|
READLINK3args *args = (READLINK3args*)rop->request;
|
||||||
if (self->parent->trace)
|
if (self->parent->trace)
|
||||||
fprintf(stderr, "[%d] READLINK %lu\n", self->nfs_fd, kv_fh_inode(args->symlink));
|
fprintf(stderr, "[%d] READLINK %ju\n", self->nfs_fd, kv_fh_inode(args->symlink));
|
||||||
READLINK3res *reply = (READLINK3res*)rop->reply;
|
READLINK3res *reply = (READLINK3res*)rop->reply;
|
||||||
if (!kv_fh_valid(args->symlink) || args->symlink == NFS_ROOT_HANDLE)
|
if (!kv_fh_valid(args->symlink) || args->symlink == NFS_ROOT_HANDLE)
|
||||||
{
|
{
|
||||||
|
|
|
@ -333,7 +333,7 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
||||||
st->maxcount = args->count;
|
st->maxcount = args->count;
|
||||||
}
|
}
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] READDIR %lu VERF %lx OFFSET %lu LIMIT %lu\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount);
|
fprintf(stderr, "[%d] READDIR %ju VERF %jx OFFSET %ju LIMIT %ju\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount);
|
||||||
st->cb = [st](int res)
|
st->cb = [st](int res)
|
||||||
{
|
{
|
||||||
if (st->is_plus)
|
if (st->is_plus)
|
||||||
|
|
|
@ -177,7 +177,7 @@ resume_5:
|
||||||
{
|
{
|
||||||
fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)",
|
fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)",
|
||||||
kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2);
|
kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2);
|
||||||
fprintf(stderr, " - inode %lu may be left as garbage\n", st->ino);
|
fprintf(stderr, " - inode %ju may be left as garbage\n", st->ino);
|
||||||
}
|
}
|
||||||
if (st->res < 0)
|
if (st->res < 0)
|
||||||
{
|
{
|
||||||
|
@ -235,7 +235,7 @@ resume_6:
|
||||||
{
|
{
|
||||||
if (r.err)
|
if (r.err)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to remove inode %lx data: %s (code %d)\n",
|
fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n",
|
||||||
st->ino, r.text.c_str(), r.err);
|
st->ino, r.text.c_str(), r.err);
|
||||||
}
|
}
|
||||||
st->res = r.err;
|
st->res = r.err;
|
||||||
|
@ -261,7 +261,7 @@ int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop)
|
||||||
st->dir_ino = kv_fh_inode(args->object.dir);
|
st->dir_ino = kv_fh_inode(args->object.dir);
|
||||||
st->filename = args->object.name;
|
st->filename = args->object.name;
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] REMOVE %lu/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
fprintf(stderr, "[%d] REMOVE %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
||||||
if (!st->dir_ino)
|
if (!st->dir_ino)
|
||||||
{
|
{
|
||||||
*reply = (REMOVE3res){ .status = NFS3ERR_INVAL };
|
*reply = (REMOVE3res){ .status = NFS3ERR_INVAL };
|
||||||
|
@ -292,7 +292,7 @@ int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
||||||
st->filename = args->object.name;
|
st->filename = args->object.name;
|
||||||
st->is_rmdir = true;
|
st->is_rmdir = true;
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] RMDIR %lu/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
fprintf(stderr, "[%d] RMDIR %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
||||||
if (!st->dir_ino)
|
if (!st->dir_ino)
|
||||||
{
|
{
|
||||||
*reply = (RMDIR3res){ .status = NFS3ERR_INVAL };
|
*reply = (RMDIR3res){ .status = NFS3ERR_INVAL };
|
||||||
|
|
|
@ -177,7 +177,7 @@ int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop)
|
||||||
st->old_name = args->from.name;
|
st->old_name = args->from.name;
|
||||||
st->new_name = args->to.name;
|
st->new_name = args->to.name;
|
||||||
if (st->self->parent->trace)
|
if (st->self->parent->trace)
|
||||||
fprintf(stderr, "[%d] RENAME %lu/%s -> %lu/%s\n", st->self->nfs_fd, st->old_dir_ino, st->old_name.c_str(), st->new_dir_ino, st->new_name.c_str());
|
fprintf(stderr, "[%d] RENAME %ju/%s -> %ju/%s\n", st->self->nfs_fd, st->old_dir_ino, st->old_name.c_str(), st->new_dir_ino, st->new_name.c_str());
|
||||||
if (!st->old_dir_ino || !st->new_dir_ino || st->old_name == "" || st->new_name == "")
|
if (!st->old_dir_ino || !st->new_dir_ino || st->old_name == "" || st->new_name == "")
|
||||||
{
|
{
|
||||||
RENAME3res *reply = (RENAME3res*)rop->reply;
|
RENAME3res *reply = (RENAME3res*)rop->reply;
|
||||||
|
|
|
@ -93,7 +93,7 @@ resume_2:
|
||||||
}
|
}
|
||||||
if (st->res < 0)
|
if (st->res < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to update inode %lu: %s (code %d)\n", st->ino, strerror(-st->res), st->res);
|
fprintf(stderr, "Failed to update inode %ju: %s (code %d)\n", st->ino, strerror(-st->res), st->res);
|
||||||
auto cb = std::move(st->cb);
|
auto cb = std::move(st->cb);
|
||||||
cb(st->res);
|
cb(st->res);
|
||||||
return;
|
return;
|
||||||
|
@ -110,7 +110,7 @@ resume_2:
|
||||||
{
|
{
|
||||||
if (r.err)
|
if (r.err)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to truncate inode %lu: %s (code %d)\n",
|
fprintf(stderr, "Failed to truncate inode %ju: %s (code %d)\n",
|
||||||
st->ino, r.text.c_str(), r.err);
|
st->ino, r.text.c_str(), r.err);
|
||||||
}
|
}
|
||||||
st->res = r.err;
|
st->res = r.err;
|
||||||
|
@ -151,7 +151,7 @@ int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
||||||
st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
|
st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
|
||||||
if (args->new_attributes.mtime.set_it)
|
if (args->new_attributes.mtime.set_it)
|
||||||
st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
|
st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
|
||||||
fprintf(stderr, "SETATTR %lu ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str());
|
fprintf(stderr, "SETATTR %ju ATTRS %s\n", st->ino, json11::Json(st->set_attrs).dump().c_str());
|
||||||
st->cb = [st](int res)
|
st->cb = [st](int res)
|
||||||
{
|
{
|
||||||
auto reply = (SETATTR3res*)st->rop->reply;
|
auto reply = (SETATTR3res*)st->rop->reply;
|
||||||
|
|
|
@ -354,7 +354,7 @@ void nfs_proxy_t::parse_stats(etcd_kv_t & kv)
|
||||||
pool_id_t pool_id = 0;
|
pool_id_t pool_id = 0;
|
||||||
inode_t inode_num = 0;
|
inode_t inode_num = 0;
|
||||||
char null_byte = 0;
|
char null_byte = 0;
|
||||||
int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode_num, &null_byte);
|
int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode_num, &null_byte);
|
||||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num)
|
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
|
@ -394,7 +394,7 @@ void nfs_proxy_t::check_default_pool()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "There are %lu pools. Please select default pool with --pool option\n", cli->st_cli.pool_config.size());
|
fprintf(stderr, "There are %zu pools. Please select default pool with --pool option\n", cli->st_cli.pool_config.size());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
38
src/osd.cpp
38
src/osd.cpp
|
@ -233,6 +233,8 @@ void osd_t::parse_config(bool init)
|
||||||
? 10 : config["recovery_tune_agg_interval"].uint64_value();
|
? 10 : config["recovery_tune_agg_interval"].uint64_value();
|
||||||
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
|
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
|
||||||
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
|
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
|
||||||
|
recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null()
|
||||||
|
? 10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value();
|
||||||
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||||
if (recovery_pg_switch < 1)
|
if (recovery_pg_switch < 1)
|
||||||
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
|
@ -473,14 +475,14 @@ void osd_t::print_stats()
|
||||||
if (msgr.stats.op_stat_bytes[i] != 0)
|
if (msgr.stats.op_stat_bytes[i] != 0)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
|
"[OSD %ju] avg latency for op %d (%s): %ju us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
|
||||||
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
||||||
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
|
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
|
printf("[OSD %ju] avg latency for op %d (%s): %ju us\n", osd_num, i, osd_op_names[i], avg);
|
||||||
}
|
}
|
||||||
prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
|
prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
|
||||||
prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
|
prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
|
||||||
|
@ -492,7 +494,7 @@ void osd_t::print_stats()
|
||||||
if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
||||||
{
|
{
|
||||||
uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
||||||
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
|
printf("[OSD %ju] avg latency for subop %d (%s): %jd us\n", osd_num, i, osd_op_names[i], avg);
|
||||||
prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
|
prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
|
||||||
prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
|
prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
|
||||||
}
|
}
|
||||||
|
@ -503,7 +505,7 @@ void osd_t::print_stats()
|
||||||
{
|
{
|
||||||
uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
|
uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %ld us, delay %ld us\n", osd_num, recovery_stat_names[i],
|
"[OSD %ju] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %jd us, delay %jd us\n", osd_num, recovery_stat_names[i],
|
||||||
(recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
|
(recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
|
||||||
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
||||||
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
|
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
|
||||||
|
@ -515,19 +517,19 @@ void osd_t::print_stats()
|
||||||
memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
|
memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
|
||||||
if (corrupted_objects > 0)
|
if (corrupted_objects > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
|
printf("[OSD %ju] %ju object(s) corrupted\n", osd_num, corrupted_objects);
|
||||||
}
|
}
|
||||||
if (incomplete_objects > 0)
|
if (incomplete_objects > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
printf("[OSD %ju] %ju object(s) incomplete\n", osd_num, incomplete_objects);
|
||||||
}
|
}
|
||||||
if (degraded_objects > 0)
|
if (degraded_objects > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
|
printf("[OSD %ju] %ju object(s) degraded\n", osd_num, degraded_objects);
|
||||||
}
|
}
|
||||||
if (misplaced_objects > 0)
|
if (misplaced_objects > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
|
printf("[OSD %ju] %ju object(s) misplaced\n", osd_num, misplaced_objects);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -546,27 +548,27 @@ void osd_t::print_slow()
|
||||||
int l = sizeof(alloc), n;
|
int l = sizeof(alloc), n;
|
||||||
char *buf = alloc;
|
char *buf = alloc;
|
||||||
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
|
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
|
||||||
bufprintf("[OSD %lu] Slow op %lx", osd_num, (unsigned long)op);
|
bufprintf("[OSD %ju] Slow op %jx", osd_num, (uint64_t)op);
|
||||||
if (kv.second->osd_num)
|
if (kv.second->osd_num)
|
||||||
{
|
{
|
||||||
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
|
bufprintf(" from peer OSD %ju (client %d)", kv.second->osd_num, kv.second->peer_fd);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
bufprintf(" from client %d", kv.second->peer_fd);
|
bufprintf(" from client %d", kv.second->peer_fd);
|
||||||
}
|
}
|
||||||
bufprintf(": %s id=%lu", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
|
bufprintf(": %s id=%ju", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
|
||||||
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||||
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
|
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
|
||||||
{
|
{
|
||||||
bufprintf(" %lx:%lx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
|
bufprintf(" %jx:%jx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
|
||||||
if (op->req.sec_rw.version == UINT64_MAX)
|
if (op->req.sec_rw.version == UINT64_MAX)
|
||||||
{
|
{
|
||||||
bufprintf("%s", "max");
|
bufprintf("%s", "max");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
bufprintf("%lu", op->req.sec_rw.version);
|
bufprintf("%ju", op->req.sec_rw.version);
|
||||||
}
|
}
|
||||||
if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
|
if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
|
||||||
{
|
{
|
||||||
|
@ -578,17 +580,17 @@ void osd_t::print_slow()
|
||||||
for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
|
for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
|
||||||
{
|
{
|
||||||
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
|
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
|
||||||
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
|
bufprintf(i == 0 ? " %jx:%jx v%ju" : ", %jx:%jx v%ju", ov->oid.inode, ov->oid.stripe, ov->version);
|
||||||
}
|
}
|
||||||
if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
|
if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
|
||||||
{
|
{
|
||||||
bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
|
bufprintf(", ... (%ju items)", op->req.sec_stab.len/sizeof(obj_ver_id));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||||
{
|
{
|
||||||
bufprintf(
|
bufprintf(
|
||||||
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
|
" oid=%jx/%jx-%jx/%jx pg=%u/%u, stripe=%ju, limit=%u",
|
||||||
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
||||||
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
||||||
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
||||||
|
@ -598,7 +600,7 @@ void osd_t::print_slow()
|
||||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||||
op->req.hdr.opcode == OSD_OP_DELETE)
|
op->req.hdr.opcode == OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
bufprintf(" inode=%lx offset=%lx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
|
bufprintf(" inode=%jx offset=%jx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
|
||||||
}
|
}
|
||||||
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||||
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE ||
|
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE ||
|
||||||
|
@ -610,7 +612,7 @@ void osd_t::print_slow()
|
||||||
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
|
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
|
||||||
if (wait_for)
|
if (wait_for)
|
||||||
{
|
{
|
||||||
bufprintf(" wait=%d (detail=%lu)", wait_for, PRIV(op->bs_op)->wait_detail);
|
bufprintf(" wait=%d (detail=%ju)", wait_for, PRIV(op->bs_op)->wait_detail);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||||
|
|
|
@ -125,6 +125,7 @@ class osd_t
|
||||||
int recovery_tune_interval = 1;
|
int recovery_tune_interval = 1;
|
||||||
int recovery_tune_agg_interval = 10;
|
int recovery_tune_agg_interval = 10;
|
||||||
int recovery_tune_sleep_min_us = 10;
|
int recovery_tune_sleep_min_us = 10;
|
||||||
|
int recovery_tune_sleep_cutoff_us = 10000000;
|
||||||
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
int inode_vanish_time = 60;
|
int inode_vanish_time = 60;
|
||||||
|
@ -282,6 +283,7 @@ class osd_t
|
||||||
void exec_sync_stab_all(osd_op_t *cur_op);
|
void exec_sync_stab_all(osd_op_t *cur_op);
|
||||||
void exec_show_config(osd_op_t *cur_op);
|
void exec_show_config(osd_op_t *cur_op);
|
||||||
void exec_secondary(osd_op_t *cur_op);
|
void exec_secondary(osd_op_t *cur_op);
|
||||||
|
void exec_secondary_real(osd_op_t *cur_op);
|
||||||
void secondary_op_callback(osd_op_t *cur_op);
|
void secondary_op_callback(osd_op_t *cur_op);
|
||||||
|
|
||||||
// primary ops
|
// primary ops
|
||||||
|
|
|
@ -117,7 +117,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||||
conf["immediate_commit"].is_null())
|
conf["immediate_commit"].is_null())
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] Warning: peer OSD %lu does not report block_size/bitmap_granularity/immediate_commit."
|
"[OSD %ju] Warning: peer OSD %ju does not report block_size/bitmap_granularity/immediate_commit."
|
||||||
" Is it older than 0.6.3?\n", this->osd_num, cl->osd_num
|
" Is it older than 0.6.3?\n", this->osd_num, cl->osd_num
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -129,7 +129,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||||
immediate_commit == IMMEDIATE_SMALL && peer_immediate_commit == IMMEDIATE_NONE)
|
immediate_commit == IMMEDIATE_SMALL && peer_immediate_commit == IMMEDIATE_NONE)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] My immediate_commit is \"%s\", but peer OSD %lu has \"%s\". We can't work together\n",
|
"[OSD %ju] My immediate_commit is \"%s\", but peer OSD %ju has \"%s\". We can't work together\n",
|
||||||
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
|
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
|
||||||
cl->osd_num, conf["immediate_commit"].string_value().c_str()
|
cl->osd_num, conf["immediate_commit"].string_value().c_str()
|
||||||
);
|
);
|
||||||
|
@ -138,7 +138,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||||
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
|
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
|
"[OSD %ju] My block_size is %u, but peer OSD %ju has %ju. We can't work together\n",
|
||||||
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
|
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
|
@ -146,7 +146,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||||
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
|
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
|
"[OSD %ju] My bitmap_granularity is %u, but peer OSD %ju has %ju. We can't work together\n",
|
||||||
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
|
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
|
@ -181,7 +181,7 @@ json11::Json osd_t::get_statistics()
|
||||||
timespec ts;
|
timespec ts;
|
||||||
clock_gettime(CLOCK_REALTIME, &ts);
|
clock_gettime(CLOCK_REALTIME, &ts);
|
||||||
char time_str[50] = { 0 };
|
char time_str[50] = { 0 };
|
||||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
sprintf(time_str, "%jd.%03ld", (uint64_t)ts.tv_sec, ts.tv_nsec/1000000);
|
||||||
st["time"] = time_str;
|
st["time"] = time_str;
|
||||||
if (bs)
|
if (bs)
|
||||||
{
|
{
|
||||||
|
@ -358,7 +358,7 @@ void osd_t::report_statistics()
|
||||||
etcd_reporting_stats = false;
|
etcd_reporting_stats = false;
|
||||||
if (err != "")
|
if (err != "")
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
|
printf("[OSD %ju] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
|
||||||
// Retry indefinitely
|
// Retry indefinitely
|
||||||
tfd->set_timer(st_cli.etcd_slow_timeout, false, [this](int timer_id)
|
tfd->set_timer(st_cli.etcd_slow_timeout, false, [this](int timer_id)
|
||||||
{
|
{
|
||||||
|
@ -367,7 +367,7 @@ void osd_t::report_statistics()
|
||||||
}
|
}
|
||||||
else if (res["error"].string_value() != "")
|
else if (res["error"].string_value() != "")
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
|
printf("[OSD %ju] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
|
||||||
force_stop(1);
|
force_stop(1);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -432,7 +432,7 @@ void osd_t::acquire_lease()
|
||||||
create_osd_state();
|
create_osd_state();
|
||||||
});
|
});
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] reporting to etcd at %s every %d seconds (statistics every %d seconds)\n", this->osd_num,
|
"[OSD %ju] reporting to etcd at %s every %d seconds (statistics every %d seconds)\n", this->osd_num,
|
||||||
(config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
|
(config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
|
||||||
etcd_report_interval, etcd_stats_interval
|
etcd_report_interval, etcd_stats_interval
|
||||||
);
|
);
|
||||||
|
@ -499,11 +499,11 @@ void osd_t::create_osd_state()
|
||||||
{
|
{
|
||||||
// OSD is already up
|
// OSD is already up
|
||||||
auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
|
auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
|
||||||
printf("Key %s already exists in etcd, OSD %lu is still up\n", kv.key.c_str(), this->osd_num);
|
printf("Key %s already exists in etcd, OSD %ju is still up\n", kv.key.c_str(), this->osd_num);
|
||||||
int64_t port = kv.value["port"].int64_value();
|
int64_t port = kv.value["port"].int64_value();
|
||||||
for (auto & addr: kv.value["addresses"].array_items())
|
for (auto & addr: kv.value["addresses"].array_items())
|
||||||
{
|
{
|
||||||
printf(" listening at: %s:%ld\n", addr.string_value().c_str(), port);
|
printf(" listening at: %s:%jd\n", addr.string_value().c_str(), port);
|
||||||
}
|
}
|
||||||
force_stop(0);
|
force_stop(0);
|
||||||
return;
|
return;
|
||||||
|
@ -569,13 +569,13 @@ void osd_t::force_stop(int exitcode)
|
||||||
{
|
{
|
||||||
printf("Error revoking etcd lease: %s\n", err.c_str());
|
printf("Error revoking etcd lease: %s\n", err.c_str());
|
||||||
}
|
}
|
||||||
printf("[OSD %lu] Force stopping\n", this->osd_num);
|
printf("[OSD %ju] Force stopping\n", this->osd_num);
|
||||||
exit(exitcode);
|
exit(exitcode);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Force stopping\n", this->osd_num);
|
printf("[OSD %ju] Force stopping\n", this->osd_num);
|
||||||
exit(exitcode);
|
exit(exitcode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -629,7 +629,7 @@ void osd_t::apply_pg_count()
|
||||||
if (still_active > 0)
|
if (still_active > 0)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
|
"[OSD %ju] PG count change detected for pool %u (new is %ju, old is %u),"
|
||||||
" but %u PG(s) are still active. This is not allowed. Exiting\n",
|
" but %u PG(s) are still active. This is not allowed. Exiting\n",
|
||||||
this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
|
this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
|
||||||
);
|
);
|
||||||
|
@ -663,7 +663,7 @@ void osd_t::apply_pg_config()
|
||||||
if (!warned_block_size)
|
if (!warned_block_size)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] My block_size and bitmap_granularity are %u/%u"
|
"[OSD %ju] My block_size and bitmap_granularity are %u/%u"
|
||||||
", but pool %u has %u/%u. Refusing to start PGs of this pool\n",
|
", but pool %u has %u/%u. Refusing to start PGs of this pool\n",
|
||||||
this->osd_num, bs_block_size, bs_bitmap_granularity,
|
this->osd_num, bs_block_size, bs_bitmap_granularity,
|
||||||
pool_id, pool_item.second.data_block_size, pool_item.second.bitmap_granularity
|
pool_id, pool_item.second.data_block_size, pool_item.second.bitmap_granularity
|
||||||
|
@ -843,7 +843,13 @@ void osd_t::report_pg_states()
|
||||||
pg_state_exists = true;
|
pg_state_exists = true;
|
||||||
if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
|
if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
|
||||||
{
|
{
|
||||||
// Nothing to check or report, PG is already taken over by another OSD
|
// Nothing to report, PG is already taken over by another OSD
|
||||||
|
checks.push_back(json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", state_key_base64 },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", st_cli.etcd_watch_revision+1 },
|
||||||
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -851,11 +857,6 @@ void osd_t::report_pg_states()
|
||||||
}
|
}
|
||||||
if (!pg_state_exists)
|
if (!pg_state_exists)
|
||||||
{
|
{
|
||||||
if (pg.state == PG_OFFLINE)
|
|
||||||
{
|
|
||||||
// Nothing to check or report, PG is already stopped
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Check that the PG key does not exist
|
// Check that the PG key does not exist
|
||||||
// Failed check indicates an unsuccessful PG lock attempt in this case
|
// Failed check indicates an unsuccessful PG lock attempt in this case
|
||||||
checks.push_back(json11::Json::object {
|
checks.push_back(json11::Json::object {
|
||||||
|
@ -984,7 +985,7 @@ void osd_t::report_pg_states()
|
||||||
kv.value["primary"].uint64_value() != this->osd_num)
|
kv.value["primary"].uint64_value() != this->osd_num)
|
||||||
{
|
{
|
||||||
// PG is somehow captured by another OSD
|
// PG is somehow captured by another OSD
|
||||||
printf("BUG: OSD %lu captured our PG %u/%u. Race condition detected, exiting\n",
|
printf("BUG: OSD %ju captured our PG %u/%u. Race condition detected, exiting\n",
|
||||||
kv.value["primary"].uint64_value(), pool_id, pg_num);
|
kv.value["primary"].uint64_value(), pool_id, pg_num);
|
||||||
force_stop(1);
|
force_stop(1);
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -66,7 +66,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||||
{
|
{
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
|
printf("[PG %u/%u] flush batch %jx completed on OSD %ju with result %d\n",
|
||||||
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
|
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
|
||||||
}
|
}
|
||||||
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
||||||
|
@ -88,7 +88,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
|
printf("Error while doing flush on OSD %ju: %d (%s)\n", osd_num, retval, strerror(-retval));
|
||||||
auto fd_it = msgr.osd_peer_fds.find(peer_osd);
|
auto fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||||
if (fd_it != msgr.osd_peer_fds.end())
|
if (fd_it != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
|
@ -122,7 +122,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||||
{
|
{
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
|
printf("[PG %u/%u] continuing write %jx to object %jx:%jx after flush\n",
|
||||||
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
|
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
|
||||||
}
|
}
|
||||||
continue_ops.push_back(wr_it->second);
|
continue_ops.push_back(wr_it->second);
|
||||||
|
@ -169,12 +169,12 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
|
"[PG %u/%u] flush batch %jx on OSD %ju: %s objects: ",
|
||||||
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
|
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
|
||||||
);
|
);
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
{
|
{
|
||||||
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
|
printf(i > 0 ? ", %jx:%jx v%ju" : "%jx:%jx v%ju", data[i].oid.inode, data[i].oid.stripe, data[i].version);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
@ -305,7 +305,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
};
|
};
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("Submitting recovery operation for %lx:%lx (%s)\n", op->oid.inode, op->oid.stripe, op->degraded ? "degraded" : "misplaced");
|
printf("Submitting recovery operation for %jx:%jx (%s)\n", op->oid.inode, op->oid.stripe, op->degraded ? "degraded" : "misplaced");
|
||||||
}
|
}
|
||||||
op->osd_op->peer_fd = -1;
|
op->osd_op->peer_fd = -1;
|
||||||
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
||||||
|
@ -315,7 +315,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
// Error recovering object
|
// Error recovering object
|
||||||
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
|
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n",
|
"[PG %u/%u] Recovery operation failed with object %jx:%jx: error %jd\n",
|
||||||
INODE_POOL(op->oid.inode),
|
INODE_POOL(op->oid.inode),
|
||||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||||
op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
|
op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
|
||||||
|
@ -323,7 +323,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
}
|
}
|
||||||
else if (log_level > 2)
|
else if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
printf("Recovery operation done for %jx:%jx\n", op->oid.inode, op->oid.stripe);
|
||||||
}
|
}
|
||||||
finish_recovery_op(op);
|
finish_recovery_op(op);
|
||||||
};
|
};
|
||||||
|
@ -422,6 +422,10 @@ void osd_t::tune_recovery()
|
||||||
rtune_avg_lat = total_recovery_usec/recovery_count;
|
rtune_avg_lat = total_recovery_usec/recovery_count;
|
||||||
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
|
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
|
||||||
auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
|
auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
|
||||||
|
if (sleep_us > recovery_tune_sleep_cutoff_us)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (recovery_target_sleep_items.size() != recovery_tune_agg_interval)
|
if (recovery_target_sleep_items.size() != recovery_tune_agg_interval)
|
||||||
{
|
{
|
||||||
recovery_target_sleep_items.resize(recovery_tune_agg_interval);
|
recovery_target_sleep_items.resize(recovery_tune_agg_interval);
|
||||||
|
@ -438,10 +442,10 @@ void osd_t::tune_recovery()
|
||||||
if (recovery_target_sleep_count < recovery_tune_agg_interval)
|
if (recovery_target_sleep_count < recovery_tune_agg_interval)
|
||||||
recovery_target_sleep_count++;
|
recovery_target_sleep_count++;
|
||||||
recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count;
|
recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count;
|
||||||
if (log_level > 4)
|
if (log_level > 1)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n",
|
"[OSD %ju] auto-tune: client util: %.2f, recovery util: %.2f, lat: %ju us -> target util %.2f, delay %ju us\n",
|
||||||
osd_num, rtune_client_util, total_recovery_usec/1000000.0/recovery_tune_interval,
|
osd_num, rtune_client_util, total_recovery_usec/1000000.0/recovery_tune_interval,
|
||||||
rtune_avg_lat, rtune_target_util, recovery_target_sleep_us
|
rtune_avg_lat, rtune_target_util, recovery_target_sleep_us
|
||||||
);
|
);
|
||||||
|
|
|
@ -113,7 +113,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||||
if (repeer)
|
if (repeer)
|
||||||
{
|
{
|
||||||
// Repeer this pg
|
// Repeer this pg
|
||||||
printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
|
printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
|
||||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
|
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
|
||||||
{
|
{
|
||||||
start_pg_peering(pg);
|
start_pg_peering(pg);
|
||||||
|
@ -222,6 +222,9 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
}
|
}
|
||||||
if (pg.pg_cursize < pg.pg_minsize)
|
if (pg.pg_cursize < pg.pg_minsize)
|
||||||
{
|
{
|
||||||
|
// FIXME: Incomplete EC PGs may currently easily lead to write hangs ("slow ops" in OSD logs)
|
||||||
|
// because such PGs don't flush unstable entries on secondary OSDs so they can't remove these
|
||||||
|
// entries from their journals...
|
||||||
pg.state = PG_INCOMPLETE;
|
pg.state = PG_INCOMPLETE;
|
||||||
report_pg_state(pg);
|
report_pg_state(pg);
|
||||||
return;
|
return;
|
||||||
|
@ -344,7 +347,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
}
|
}
|
||||||
add_bs_subop_stats(op);
|
add_bs_subop_stats(op);
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
|
"[PG %u/%u] Got object list from OSD %ju (local): %d object versions (%ju of them stable)\n",
|
||||||
ps->pool_id, ps->pg_num, role_osd, bs_op->retval, bs_op->version
|
ps->pool_id, ps->pg_num, role_osd, bs_op->retval, bs_op->version
|
||||||
);
|
);
|
||||||
ps->list_results[role_osd] = {
|
ps->list_results[role_osd] = {
|
||||||
|
@ -384,7 +387,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
{
|
{
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
printf("Failed to get object list from OSD %ju (retval=%jd), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
||||||
int fail_fd = op->peer_fd;
|
int fail_fd = op->peer_fd;
|
||||||
ps->list_ops.erase(role_osd);
|
ps->list_ops.erase(role_osd);
|
||||||
delete op;
|
delete op;
|
||||||
|
@ -392,7 +395,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
|
"[PG %u/%u] Got object list from OSD %ju: %jd object versions (%ju of them stable)\n",
|
||||||
ps->pool_id, ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
|
ps->pool_id, ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
|
||||||
);
|
);
|
||||||
ps->list_results[role_osd] = {
|
ps->list_results[role_osd] = {
|
||||||
|
|
|
@ -239,7 +239,7 @@ void pg_obj_state_check_t::finish_object()
|
||||||
{
|
{
|
||||||
if (log_level > 1)
|
if (log_level > 1)
|
||||||
{
|
{
|
||||||
printf("Object is incomplete: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
printf("Object is incomplete: %jx:%jx version=%ju/%ju\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||||
}
|
}
|
||||||
state = OBJ_INCOMPLETE;
|
state = OBJ_INCOMPLETE;
|
||||||
pg->state = pg->state | PG_HAS_INCOMPLETE;
|
pg->state = pg->state | PG_HAS_INCOMPLETE;
|
||||||
|
@ -248,7 +248,7 @@ void pg_obj_state_check_t::finish_object()
|
||||||
{
|
{
|
||||||
if (log_level > 1)
|
if (log_level > 1)
|
||||||
{
|
{
|
||||||
printf("Object is degraded: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
printf("Object is degraded: %jx:%jx version=%ju/%ju\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||||
}
|
}
|
||||||
state = OBJ_DEGRADED;
|
state = OBJ_DEGRADED;
|
||||||
pg->state = pg->state | PG_HAS_DEGRADED;
|
pg->state = pg->state | PG_HAS_DEGRADED;
|
||||||
|
@ -257,7 +257,7 @@ void pg_obj_state_check_t::finish_object()
|
||||||
{
|
{
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
printf("Object is misplaced: %jx:%jx version=%ju/%ju\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||||
}
|
}
|
||||||
state |= OBJ_MISPLACED;
|
state |= OBJ_MISPLACED;
|
||||||
pg->state = pg->state | PG_HAS_MISPLACED;
|
pg->state = pg->state | PG_HAS_MISPLACED;
|
||||||
|
@ -267,7 +267,7 @@ void pg_obj_state_check_t::finish_object()
|
||||||
{
|
{
|
||||||
for (int i = obj_start; i < obj_end; i++)
|
for (int i = obj_start; i < obj_end; i++)
|
||||||
{
|
{
|
||||||
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num,
|
printf("v%ju present on: osd %ju, role %jd%s\n", list[i].version, list[i].osd_num,
|
||||||
(list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
(list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -445,7 +445,7 @@ void pg_t::calc_object_states(int log_level)
|
||||||
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
|
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] %lu clean objects on target OSD set %s\n",
|
"[PG %u/%u] %ju clean objects on target OSD set %s\n",
|
||||||
pool_id, pg_num, clean_count, osd_set_desc.c_str()
|
pool_id, pg_num, clean_count, osd_set_desc.c_str()
|
||||||
);
|
);
|
||||||
for (auto & stp: state_dict)
|
for (auto & stp: state_dict)
|
||||||
|
@ -460,7 +460,7 @@ void pg_t::calc_object_states(int log_level)
|
||||||
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
|
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
|
||||||
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
|
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
|
||||||
}
|
}
|
||||||
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
printf("[PG %u/%u] %ju objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -468,7 +468,7 @@ void pg_t::calc_object_states(int log_level)
|
||||||
void pg_t::print_state()
|
void pg_t::print_state()
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%ju objects)\n", pool_id, pg_num,
|
||||||
(state & PG_STARTING) ? "starting" : "",
|
(state & PG_STARTING) ? "starting" : "",
|
||||||
(state & PG_OFFLINE) ? "offline" : "",
|
(state & PG_OFFLINE) ? "offline" : "",
|
||||||
(state & PG_PEERING) ? "peering" : "",
|
(state & PG_PEERING) ? "peering" : "",
|
||||||
|
|
|
@ -49,10 +49,10 @@ int main(int argc, char *argv[])
|
||||||
pg.peering_state->list_results[osd_num] = r;
|
pg.peering_state->list_results[osd_num] = r;
|
||||||
}
|
}
|
||||||
pg.calc_object_states(0);
|
pg.calc_object_states(0);
|
||||||
printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
|
printf("deviation variants=%jd clean=%ju\n", pg.state_dict.size(), pg.clean_count);
|
||||||
for (auto it: pg.state_dict)
|
for (auto it: pg.state_dict)
|
||||||
{
|
{
|
||||||
printf("dev: state=%lx\n", it.second.state);
|
printf("dev: state=%jx\n", it.second.state);
|
||||||
}
|
}
|
||||||
delete pg.peering_state;
|
delete pg.peering_state;
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -473,7 +473,7 @@ pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, con
|
||||||
}
|
}
|
||||||
if (this->log_level >= log_at_level)
|
if (this->log_level >= log_at_level)
|
||||||
{
|
{
|
||||||
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
|
printf("Marking object %jx:%jx ", oid.inode, oid.stripe);
|
||||||
for (int i = 0, j = 0; i < object_state_bit_count; i++)
|
for (int i = 0, j = 0; i < object_state_bit_count; i++)
|
||||||
{
|
{
|
||||||
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
|
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
|
||||||
|
@ -483,31 +483,31 @@ pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, con
|
||||||
}
|
}
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
printf(": %lu copies available", n_copies);
|
printf(": %ju copies available", n_copies);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf(": %lu parts / %lu copies available", n_roles, n_copies);
|
printf(": %ju parts / %ju copies available", n_roles, n_copies);
|
||||||
}
|
}
|
||||||
if (n_invalid > 0)
|
if (n_invalid > 0)
|
||||||
{
|
{
|
||||||
printf(", %lu invalid", n_invalid);
|
printf(", %ju invalid", n_invalid);
|
||||||
}
|
}
|
||||||
if (n_outdated > 0)
|
if (n_outdated > 0)
|
||||||
{
|
{
|
||||||
printf(", %lu outdated", n_outdated);
|
printf(", %ju outdated", n_outdated);
|
||||||
}
|
}
|
||||||
if (n_misplaced > 0)
|
if (n_misplaced > 0)
|
||||||
{
|
{
|
||||||
printf(", %lu misplaced", n_misplaced);
|
printf(", %ju misplaced", n_misplaced);
|
||||||
}
|
}
|
||||||
if (n_corrupted > 0)
|
if (n_corrupted > 0)
|
||||||
{
|
{
|
||||||
printf(", %lu corrupted", n_corrupted);
|
printf(", %ju corrupted", n_corrupted);
|
||||||
}
|
}
|
||||||
if (n_inconsistent > 0)
|
if (n_inconsistent > 0)
|
||||||
{
|
{
|
||||||
printf(", %lu inconsistent", n_inconsistent);
|
printf(", %ju inconsistent", n_inconsistent);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,7 @@ void osd_t::autosync()
|
||||||
{
|
{
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
printf("Warning: automatic sync resulted in an error: %jd (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
||||||
}
|
}
|
||||||
delete autosync_op;
|
delete autosync_op;
|
||||||
autosync_op = NULL;
|
autosync_op = NULL;
|
||||||
|
@ -197,7 +197,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||||
});
|
});
|
||||||
#ifdef OSD_DEBUG
|
#ifdef OSD_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Submit %s to local: %lx:%lx v%lu %u-%u\n", wr ? "write" : "read",
|
"Submit %s to local: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read",
|
||||||
inode, op_data->oid.stripe | stripe_num, op_version,
|
inode, op_data->oid.stripe | stripe_num, op_version,
|
||||||
subop->bs_op->offset, subop->bs_op->len
|
subop->bs_op->offset, subop->bs_op->len
|
||||||
);
|
);
|
||||||
|
@ -225,7 +225,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||||
};
|
};
|
||||||
#ifdef OSD_DEBUG
|
#ifdef OSD_DEBUG
|
||||||
printf(
|
printf(
|
||||||
"Submit %s to osd %lu: %lx:%lx v%lu %u-%u\n", wr ? "write" : "read", role_osd_num,
|
"Submit %s to osd %ju: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read", role_osd_num,
|
||||||
inode, op_data->oid.stripe | stripe_num, op_version,
|
inode, op_data->oid.stripe | stripe_num, op_version,
|
||||||
subop->req.sec_rw.offset, subop->req.sec_rw.len
|
subop->req.sec_rw.offset, subop->req.sec_rw.len
|
||||||
);
|
);
|
||||||
|
@ -369,14 +369,14 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||||
#ifdef OSD_DEBUG
|
#ifdef OSD_DEBUG
|
||||||
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
||||||
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
||||||
printf("subop %s %lx:%lx from osd %lu: version = %lu\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
|
printf("subop %s %jx:%jx from osd %ju: version = %ju\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
|
||||||
#endif
|
#endif
|
||||||
if (op_data->fact_ver != UINT64_MAX)
|
if (op_data->fact_ver != UINT64_MAX)
|
||||||
{
|
{
|
||||||
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
|
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
|
||||||
{
|
{
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr, "different fact_versions returned from %s subops: %lu vs %lu\n",
|
stderr, "different fact_versions returned from %s subops: %ju vs %ju\n",
|
||||||
osd_op_names[opcode], version, op_data->fact_ver
|
osd_op_names[opcode], version, op_data->fact_ver
|
||||||
);
|
);
|
||||||
retval = -ERANGE;
|
retval = -ERANGE;
|
||||||
|
@ -391,8 +391,8 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
subop->peer_fd >= 0
|
subop->peer_fd >= 0
|
||||||
? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
|
? "%1$s subop to %2$jx:%3$jx v%4$ju failed on peer %7$d: retval = %5$d (expected %6$d)\n"
|
||||||
: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
|
: "%1$s subop to %2$jx:%3$jx v%4$ju failed locally: retval = %5$d (expected %6$d)\n",
|
||||||
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
||||||
retval, expected, subop->peer_fd
|
retval, expected, subop->peer_fd
|
||||||
);
|
);
|
||||||
|
|
|
@ -861,15 +861,15 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
||||||
static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
|
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
|
||||||
{
|
{
|
||||||
if (write_osd_set != read_osd_set)
|
if (write_osd_set != read_osd_set && end != 0)
|
||||||
{
|
{
|
||||||
for (int role = pg_minsize; role < pg_size; role++)
|
for (int role = pg_minsize; role < pg_size; role++)
|
||||||
{
|
{
|
||||||
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
|
if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0 && (start != 0 || end != chunk_size))
|
||||||
{
|
{
|
||||||
// Copy new parity into the read buffer to write it back
|
// Copy new parity into the read buffer to write it back
|
||||||
memcpy(
|
memcpy(
|
||||||
(uint8_t*)stripes[role].read_buf + start,
|
(uint8_t*)stripes[role].read_buf + start - stripes[role].read_start,
|
||||||
stripes[role].write_buf,
|
stripes[role].write_buf,
|
||||||
end - start
|
end - start
|
||||||
);
|
);
|
||||||
|
@ -885,7 +885,7 @@ static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size,
|
||||||
{
|
{
|
||||||
auto & s = stripes[role];
|
auto & s = stripes[role];
|
||||||
printf(
|
printf(
|
||||||
"Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
|
"Tr=%ju Tw=%ju Q=%x-%x R=%x-%x W=%x-%x Rb=%jx Wb=%jx\n",
|
||||||
read_osd_set[role], write_osd_set[role],
|
read_osd_set[role], write_osd_set[role],
|
||||||
s.req_start, s.req_end,
|
s.req_start, s.req_end,
|
||||||
s.read_start, s.read_end,
|
s.read_start, s.read_end,
|
||||||
|
|
|
@ -30,6 +30,7 @@ void test16();
|
||||||
void test_recover_22_d2();
|
void test_recover_22_d2();
|
||||||
void test_ec43_error_bruteforce();
|
void test_ec43_error_bruteforce();
|
||||||
void test_recover_53_d5();
|
void test_recover_53_d5();
|
||||||
|
void test_recover_22();
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
{
|
{
|
||||||
|
@ -70,6 +71,8 @@ int main(int narg, char *args[])
|
||||||
test_ec43_error_bruteforce();
|
test_ec43_error_bruteforce();
|
||||||
// Test 19
|
// Test 19
|
||||||
test_recover_53_d5();
|
test_recover_53_d5();
|
||||||
|
// Test 20
|
||||||
|
test_recover_22();
|
||||||
// End
|
// End
|
||||||
printf("all ok\n");
|
printf("all ok\n");
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1244,3 +1247,99 @@ void test_recover_53_d5()
|
||||||
// Done
|
// Done
|
||||||
use_ec(8, 5, false);
|
use_ec(8, 5, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_recover_22()
|
||||||
|
{
|
||||||
|
const int bmp = 128*1024 / 4096 / 8;
|
||||||
|
use_ec(4, 2, true);
|
||||||
|
osd_num_t osd_set[4] = { 1, 2, 3, 4 };
|
||||||
|
osd_num_t write_osd_set[4] = { 5, 0, 3, 0 };
|
||||||
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
|
unsigned bitmaps[4] = { 0 };
|
||||||
|
// split
|
||||||
|
void *write_buf = (uint8_t*)malloc_or_die(4096);
|
||||||
|
set_pattern(write_buf, 4096, PATTERN0);
|
||||||
|
split_stripes(2, 128*1024, 120*1024, 4096, stripes);
|
||||||
|
assert(stripes[0].req_start == 120*1024 && stripes[0].req_end == 124*1024);
|
||||||
|
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
|
||||||
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
|
// calc_rmw
|
||||||
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 2, write_osd_set, 128*1024, bmp);
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
|
assert(rmw_buf);
|
||||||
|
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||||
|
assert(stripes[1].read_start == 120*1024 && stripes[1].read_end == 124*1024);
|
||||||
|
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
|
||||||
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
|
assert(stripes[0].write_start == 120*1024 && stripes[0].write_end == 124*1024);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
|
||||||
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
|
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+132*1024);
|
||||||
|
assert(stripes[2].read_buf == NULL);
|
||||||
|
assert(stripes[3].read_buf == NULL);
|
||||||
|
assert(stripes[0].write_buf == write_buf);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
|
||||||
|
assert(stripes[3].write_buf == NULL);
|
||||||
|
// encode
|
||||||
|
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
|
||||||
|
set_pattern(stripes[1].read_buf, 4*1024, PATTERN2);
|
||||||
|
memset(stripes[0].bmp_buf, 0xff, bmp);
|
||||||
|
memset(stripes[1].bmp_buf, 0xff, bmp);
|
||||||
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp);
|
||||||
|
assert(*(uint32_t*)stripes[2].bmp_buf == 0);
|
||||||
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
|
||||||
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
|
assert(stripes[0].write_buf == stripes[0].read_buf);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
|
||||||
|
assert(stripes[3].write_buf == NULL);
|
||||||
|
check_pattern(stripes[2].write_buf, 4*1024, PATTERN0^PATTERN2);
|
||||||
|
// decode and verify
|
||||||
|
memset(stripes, 0, sizeof(stripes));
|
||||||
|
split_stripes(2, 128*1024, 0, 256*1024, stripes);
|
||||||
|
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
|
||||||
|
assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
|
||||||
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
|
for (int role = 0; role < 4; role++)
|
||||||
|
{
|
||||||
|
stripes[role].read_start = stripes[role].req_start;
|
||||||
|
stripes[role].read_end = stripes[role].req_end;
|
||||||
|
}
|
||||||
|
assert(extend_missing_stripes(stripes, write_osd_set, 2, 4) == 0);
|
||||||
|
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||||
|
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
||||||
|
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
|
||||||
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
|
void *read_buf = alloc_read_buffer(stripes, 4, 0);
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
|
assert(read_buf);
|
||||||
|
assert(stripes[0].read_buf == read_buf);
|
||||||
|
assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024);
|
||||||
|
assert(stripes[2].read_buf == (uint8_t*)read_buf+2*128*1024);
|
||||||
|
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
|
||||||
|
set_pattern(stripes[0].read_buf+120*1024, 4*1024, PATTERN0);
|
||||||
|
set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2);
|
||||||
|
set_pattern(stripes[2].read_buf+120*1024, 4*1024, PATTERN0^PATTERN2);
|
||||||
|
memset(stripes[0].bmp_buf, 0xff, bmp);
|
||||||
|
memset(stripes[2].bmp_buf, 0, bmp);
|
||||||
|
bitmaps[1] = 0;
|
||||||
|
bitmaps[3] = 0;
|
||||||
|
reconstruct_stripes_ec(stripes, 4, 2, bmp);
|
||||||
|
assert(bitmaps[0] == 0xFFFFFFFF);
|
||||||
|
assert(*(uint32_t*)stripes[1].bmp_buf == 0xFFFFFFFF);
|
||||||
|
check_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
|
||||||
|
free(read_buf);
|
||||||
|
// Done
|
||||||
|
free(rmw_buf);
|
||||||
|
free(write_buf);
|
||||||
|
use_ec(4, 2, false);
|
||||||
|
}
|
||||||
|
|
|
@ -82,7 +82,7 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi
|
||||||
scrub_list_op = NULL;
|
scrub_list_op = NULL;
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
printf("Failed to get object list from OSD %ju (retval=%jd), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
||||||
int fail_fd = op->peer_fd;
|
int fail_fd = op->peer_fd;
|
||||||
delete op;
|
delete op;
|
||||||
msgr.stop_client(fail_fd);
|
msgr.stop_client(fail_fd);
|
||||||
|
@ -239,7 +239,7 @@ void osd_t::submit_scrub_op(object_id oid)
|
||||||
};
|
};
|
||||||
if (log_level > 2)
|
if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
|
printf("Submitting scrub for %jx:%jx\n", oid.inode, oid.stripe);
|
||||||
}
|
}
|
||||||
osd_op->callback = [this](osd_op_t *osd_op)
|
osd_op->callback = [this](osd_op_t *osd_op)
|
||||||
{
|
{
|
||||||
|
@ -248,7 +248,7 @@ void osd_t::submit_scrub_op(object_id oid)
|
||||||
{
|
{
|
||||||
// Scrub error
|
// Scrub error
|
||||||
printf(
|
printf(
|
||||||
"Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
|
"Scrub failed with object %jx:%jx (PG %u/%u): error %jd\n",
|
||||||
oid.inode, oid.stripe, INODE_POOL(oid.inode),
|
oid.inode, oid.stripe, INODE_POOL(oid.inode),
|
||||||
map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
|
map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
|
||||||
osd_op->reply.hdr.retval
|
osd_op->reply.hdr.retval
|
||||||
|
@ -256,7 +256,7 @@ void osd_t::submit_scrub_op(object_id oid)
|
||||||
}
|
}
|
||||||
else if (log_level > 2)
|
else if (log_level > 2)
|
||||||
{
|
{
|
||||||
printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe);
|
printf("Scrubbed %jx:%jx\n", oid.inode, oid.stripe);
|
||||||
}
|
}
|
||||||
delete osd_op;
|
delete osd_op;
|
||||||
if (scrub_sleep_ms)
|
if (scrub_sleep_ms)
|
||||||
|
@ -518,7 +518,7 @@ resume_2:
|
||||||
if (votes[role] > 0 && votes[role] < votes[best])
|
if (votes[role] > 0 && votes[role] < votes[best])
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies%s\n",
|
"[PG %u/%u] Object %jx:%jx v%ju copy on OSD %ju doesn't match %d other copies%s\n",
|
||||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
||||||
op_data->stripes[role].osd_num, votes[best],
|
op_data->stripes[role].osd_num, votes[best],
|
||||||
|
@ -541,7 +541,7 @@ resume_2:
|
||||||
best = -1;
|
best = -1;
|
||||||
inconsistent = true;
|
inconsistent = true;
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
|
"[PG %u/%u] Object %jx:%jx v%ju is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
|
||||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
||||||
);
|
);
|
||||||
|
@ -559,7 +559,7 @@ resume_2:
|
||||||
{
|
{
|
||||||
inconsistent = true;
|
inconsistent = true;
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
|
"[PG %u/%u] Object %jx:%jx v%ju is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
|
||||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
||||||
);
|
);
|
||||||
|
@ -584,7 +584,7 @@ resume_2:
|
||||||
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
|
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match other chunks%s\n",
|
"[PG %u/%u] Object %jx:%jx v%ju chunk %d on OSD %ju doesn't match other chunks%s\n",
|
||||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
||||||
role, op_data->stripes[role].osd_num,
|
role, op_data->stripes[role].osd_num,
|
||||||
|
@ -596,7 +596,7 @@ resume_2:
|
||||||
{
|
{
|
||||||
inconsistent = true;
|
inconsistent = true;
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Object %lx:%lx v%lu is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
|
"[PG %u/%u] Object %jx:%jx v%ju is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
|
||||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
||||||
);
|
);
|
||||||
|
|
|
@ -42,8 +42,10 @@ void osd_t::secondary_op_callback(osd_op_t *op)
|
||||||
int retval = op->bs_op->retval;
|
int retval = op->bs_op->retval;
|
||||||
delete op->bs_op;
|
delete op->bs_op;
|
||||||
op->bs_op = NULL;
|
op->bs_op = NULL;
|
||||||
if (op->is_recovery_related() && recovery_target_sleep_us)
|
if (op->is_recovery_related() && recovery_target_sleep_us &&
|
||||||
|
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE)
|
||||||
{
|
{
|
||||||
|
// Apply pause AFTER commit. Do not apply pause to SYNC at all
|
||||||
if (!op->tv_end.tv_sec)
|
if (!op->tv_end.tv_sec)
|
||||||
{
|
{
|
||||||
clock_gettime(CLOCK_REALTIME, &op->tv_end);
|
clock_gettime(CLOCK_REALTIME, &op->tv_end);
|
||||||
|
@ -59,7 +61,25 @@ void osd_t::secondary_op_callback(osd_op_t *op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::exec_secondary(osd_op_t *cur_op)
|
void osd_t::exec_secondary(osd_op_t *op)
|
||||||
|
{
|
||||||
|
if (op->is_recovery_related() && recovery_target_sleep_us &&
|
||||||
|
op->req.hdr.opcode != OSD_OP_SEC_STABILIZE && op->req.hdr.opcode != OSD_OP_SEC_SYNC)
|
||||||
|
{
|
||||||
|
// Apply pause BEFORE write/delete
|
||||||
|
tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id)
|
||||||
|
{
|
||||||
|
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||||
|
exec_secondary_real(op);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
exec_secondary_real(op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||||
{
|
{
|
||||||
|
|
|
@ -174,7 +174,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
|
||||||
}
|
}
|
||||||
if (expected >= 0 && reply.hdr.retval != expected)
|
if (expected >= 0 && reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
printf("operation failed, retval=%ld\n", reply.hdr.retval);
|
printf("operation failed, retval=%jd\n", reply.hdr.retval);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
@ -210,7 +210,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
free(data);
|
free(data);
|
||||||
printf("Read %lx:%lx v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
|
printf("Read %jx:%jx v%ju = v%ju\n", inode, stripe, version, reply.sec_rw.version);
|
||||||
op.hdr.opcode = OSD_OP_SEC_LIST;
|
op.hdr.opcode = OSD_OP_SEC_LIST;
|
||||||
op.sec_list.list_pg = 1;
|
op.sec_list.list_pg = 1;
|
||||||
op.sec_list.pg_count = 1;
|
op.sec_list.pg_count = 1;
|
||||||
|
@ -234,7 +234,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
|
||||||
{
|
{
|
||||||
if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
|
if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
|
||||||
{
|
{
|
||||||
printf("list: %lx:%lx v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
|
printf("list: %jx:%jx v%ju stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -35,9 +35,9 @@ static uint64_t sync_sum = 0, sync_count = 0;
|
||||||
|
|
||||||
void handle_sigint(int sig)
|
void handle_sigint(int sig)
|
||||||
{
|
{
|
||||||
printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0);
|
printf("4k randread: %ju us avg\n", read_count ? read_sum/read_count : 0);
|
||||||
printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0);
|
printf("4k randwrite: %ju us avg\n", write_count ? write_sum/write_count : 0);
|
||||||
printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
|
printf("sync: %ju us avg\n", sync_count ? sync_sum/sync_count : 0);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,7 +106,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
|
||||||
}
|
}
|
||||||
if (reply.hdr.retval != expected)
|
if (reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
printf("operation failed, retval=%ld (%s)\n", reply.hdr.retval, strerror(-reply.hdr.retval));
|
printf("operation failed, retval=%jd (%s)\n", reply.hdr.retval, strerror(-reply.hdr.retval));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue