Compare commits
47 Commits
Author | SHA1 | Date |
---|---|---|
|
4afb617f59 | |
|
d3fde0569f | |
|
438b64f6c3 | |
|
2b0a802ea1 | |
|
0dd49c1d67 | |
|
410170db96 | |
|
7d8523e0e5 | |
|
db915184c6 | |
|
5ae6fea49c | |
|
95ec750b8c | |
|
90b1de307b | |
|
7e6a95c678 | |
|
b2416afb28 | |
|
66dc116f60 | |
|
0cb8629ab6 | |
|
b7322a405a | |
|
5692630005 | |
|
00ced7cea7 | |
|
ebdb75e287 | |
|
f397fe9c6a | |
|
28560b4ae5 | |
|
2d07449e74 | |
|
80c4e8c20f | |
|
2ab0ae3bc9 | |
|
05e59c1b4f | |
|
e6e1c5b962 | |
|
9556eeae45 | |
|
96b5a72630 | |
|
ef80f121f6 | |
|
bbdd1f3aa7 | |
|
5dd37f519a | |
|
a2278be84d | |
|
1393a2671c | |
|
9fa8ae5384 | |
|
169a35a067 | |
|
2b2a10581d | |
|
10fd51862a | |
|
15d0204f96 | |
|
21d6e88a1b | |
|
df2847df2d | |
|
327c98a4b6 | |
|
3cc0abfd81 | |
|
80e5f8ba76 | |
|
4b660f1ce8 | |
|
dfde0e60f0 | |
|
013f688ffe | |
|
cf9738ddbe |
.gitea/workflows
debian
docker
etc
apt/sources.list.d
vitastor
docs
installation
node-binding
patches
src
disk_tool
|
@ -684,6 +684,24 @@ jobs:
|
||||||
echo ""
|
echo ""
|
||||||
done
|
done
|
||||||
|
|
||||||
|
test_write_iothreads:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: build
|
||||||
|
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||||
|
steps:
|
||||||
|
- name: Run test
|
||||||
|
id: test
|
||||||
|
timeout-minutes: 3
|
||||||
|
run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
|
||||||
|
- name: Print logs
|
||||||
|
if: always() && steps.test.outcome == 'failure'
|
||||||
|
run: |
|
||||||
|
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||||
|
echo "-------- $i --------"
|
||||||
|
cat $i
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
test_write_no_same:
|
test_write_no_same:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: build
|
needs: build
|
||||||
|
@ -720,6 +738,24 @@ jobs:
|
||||||
echo ""
|
echo ""
|
||||||
done
|
done
|
||||||
|
|
||||||
|
test_heal_local_read:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: build
|
||||||
|
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||||
|
steps:
|
||||||
|
- name: Run test
|
||||||
|
id: test
|
||||||
|
timeout-minutes: 10
|
||||||
|
run: TEST_NAME=local_read POOLCFG='"local_reads":"random",' /root/vitastor/tests/test_heal.sh
|
||||||
|
- name: Print logs
|
||||||
|
if: always() && steps.test.outcome == 'failure'
|
||||||
|
run: |
|
||||||
|
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||||
|
echo "-------- $i --------"
|
||||||
|
cat $i
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
test_heal_ec:
|
test_heal_ec:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: build
|
needs: build
|
||||||
|
|
|
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VITASTOR_VERSION "2.1.0")
|
set(VITASTOR_VERSION "2.2.0")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
VITASTOR_VERSION ?= v2.1.0
|
VITASTOR_VERSION ?= v2.2.0
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ spec:
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v2.1.0
|
image: vitalif/vitastor-csi:v2.2.0
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -121,7 +121,7 @@ spec:
|
||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v2.1.0
|
image: vitalif/vitastor-csi:v2.2.0
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
|
|
@ -5,7 +5,7 @@ package vitastor
|
||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "2.1.0"
|
vitastorCSIDriverVersion = "2.2.0"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
vitastor (2.1.0-1) unstable; urgency=medium
|
vitastor (2.2.0-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
VITASTOR_VERSION ?= v2.1.0
|
VITASTOR_VERSION ?= v2.2.0
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
build:
|
build:
|
||||||
@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
@docker build --no-cache --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
||||||
|
|
||||||
push:
|
push:
|
||||||
@docker push vitalif/vitastor:$(VITASTOR_VERSION)
|
@docker push vitalif/vitastor:$(VITASTOR_VERSION)
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
deb http://vitastor.io/debian bookworm main
|
deb http://vitastor.io/debian bookworm main
|
||||||
|
deb http://http.debian.net/debian/ bookworm-backports main
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# Desired Vitastor version
|
# Desired Vitastor version
|
||||||
VITASTOR_VERSION=v2.1.0
|
VITASTOR_VERSION=v2.2.0
|
||||||
|
|
||||||
# Additional arguments for all containers
|
# Additional arguments for all containers
|
||||||
# For example, you may want to specify a custom logging driver here
|
# For example, you may want to specify a custom logging driver here
|
||||||
|
|
|
@ -24,6 +24,7 @@ affect their interaction with the cluster.
|
||||||
- [nbd_max_devices](#nbd_max_devices)
|
- [nbd_max_devices](#nbd_max_devices)
|
||||||
- [nbd_max_part](#nbd_max_part)
|
- [nbd_max_part](#nbd_max_part)
|
||||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||||
|
- [hostname](#hostname)
|
||||||
|
|
||||||
## client_iothread_count
|
## client_iothread_count
|
||||||
|
|
||||||
|
@ -215,3 +216,12 @@ just one OSD becomes 100 % full!
|
||||||
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
|
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
|
||||||
unable to start at all), so you'll be able to recover from "out of space" errors
|
unable to start at all), so you'll be able to recover from "out of space" errors
|
||||||
without destroying and recreating OSDs.
|
without destroying and recreating OSDs.
|
||||||
|
|
||||||
|
## hostname
|
||||||
|
|
||||||
|
- Type: string
|
||||||
|
- Can be changed online: yes
|
||||||
|
|
||||||
|
Clients use host name to find their distance to OSDs when [localized reads](pool.en.md#local_reads)
|
||||||
|
are enabled. By default, standard [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html)
|
||||||
|
function is used to determine host name, but you can also override it with this parameter.
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
- [nbd_max_devices](#nbd_max_devices)
|
- [nbd_max_devices](#nbd_max_devices)
|
||||||
- [nbd_max_part](#nbd_max_part)
|
- [nbd_max_part](#nbd_max_part)
|
||||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||||
|
- [hostname](#hostname)
|
||||||
|
|
||||||
## client_iothread_count
|
## client_iothread_count
|
||||||
|
|
||||||
|
@ -219,3 +220,13 @@ RDMA и хотите повысить пиковую производитель
|
||||||
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
||||||
восстановить работу кластера после ошибок отсутствия свободного места
|
восстановить работу кластера после ошибок отсутствия свободного места
|
||||||
без уничтожения и пересоздания OSD.
|
без уничтожения и пересоздания OSD.
|
||||||
|
|
||||||
|
## hostname
|
||||||
|
|
||||||
|
- Тип: строка
|
||||||
|
- Можно менять на лету: да
|
||||||
|
|
||||||
|
Клиенты используют имя хоста для определения расстояния до OSD, когда включены
|
||||||
|
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
|
||||||
|
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
|
||||||
|
но вы также можете задать имя хоста вручную данным параметром.
|
||||||
|
|
|
@ -34,6 +34,7 @@ between clients, OSDs and etcd.
|
||||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||||
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
||||||
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
||||||
|
- [min_zerocopy_send_size](#min_zerocopy_send_size)
|
||||||
- [use_sync_send_recv](#use_sync_send_recv)
|
- [use_sync_send_recv](#use_sync_send_recv)
|
||||||
|
|
||||||
## osd_network
|
## osd_network
|
||||||
|
@ -313,6 +314,34 @@ is received without an additional copy. You can try to play with this
|
||||||
parameter and see how it affects random iops and linear bandwidth if you
|
parameter and see how it affects random iops and linear bandwidth if you
|
||||||
want.
|
want.
|
||||||
|
|
||||||
|
## min_zerocopy_send_size
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 32768
|
||||||
|
|
||||||
|
OSDs and clients will attempt to use io_uring-based zero-copy TCP send
|
||||||
|
for buffers larger than this number of bytes. Zero-copy send with io_uring is
|
||||||
|
supported since Linux kernel version 6.1. Support is auto-detected and disabled
|
||||||
|
automatically when not available. It can also be disabled explicitly by setting
|
||||||
|
this parameter to a negative value.
|
||||||
|
|
||||||
|
⚠️ Warning! Zero-copy send performance may vary greatly from CPU to CPU and from
|
||||||
|
one kernel version to another. Generally, it tends to only make benefit with larger
|
||||||
|
messages. With smaller messages (say, 4 KB), it may actually be slower. 32 KB is
|
||||||
|
enough for almost all CPUs, but even smaller values are optimal for some of them.
|
||||||
|
For example, 4 KB is OK for EPYC Milan/Genoa and 12 KB is OK for Xeon Ice Lake
|
||||||
|
(but verify it yourself please).
|
||||||
|
|
||||||
|
Verification instructions:
|
||||||
|
1. Add `iommu=pt` into your Linux kernel command line and reboot.
|
||||||
|
2. Upgrade your kernel. For example, it's very important to use 6.11+ with recent AMD EPYCs.
|
||||||
|
3. Run some tests with the [send-zerocopy liburing example](https://github.com/axboe/liburing/blob/master/examples/send-zerocopy.c)
|
||||||
|
to find the minimal message size for which zero-copy is optimal.
|
||||||
|
Use `./send-zerocopy tcp -4 -R` at the server side and
|
||||||
|
`time ./send-zerocopy tcp -4 -b 0 -s BUFFER_SIZE -D SERVER_IP` at the client side with
|
||||||
|
`-z 0` (no zero-copy) and `-z 1` (zero-copy), and compare MB/s and used CPU time
|
||||||
|
(user+system).
|
||||||
|
|
||||||
## use_sync_send_recv
|
## use_sync_send_recv
|
||||||
|
|
||||||
- Type: boolean
|
- Type: boolean
|
||||||
|
|
|
@ -34,6 +34,7 @@
|
||||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||||
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
- [etcd_min_reload_interval](#etcd_min_reload_interval)
|
||||||
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
- [tcp_header_buffer_size](#tcp_header_buffer_size)
|
||||||
|
- [min_zerocopy_send_size](#min_zerocopy_send_size)
|
||||||
- [use_sync_send_recv](#use_sync_send_recv)
|
- [use_sync_send_recv](#use_sync_send_recv)
|
||||||
|
|
||||||
## osd_network
|
## osd_network
|
||||||
|
@ -321,6 +322,34 @@ Vitastor содержат 128-байтные заголовки, за котор
|
||||||
поменять этот параметр и посмотреть, как он влияет на производительность
|
поменять этот параметр и посмотреть, как он влияет на производительность
|
||||||
случайного и линейного доступа.
|
случайного и линейного доступа.
|
||||||
|
|
||||||
|
## min_zerocopy_send_size
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 32768
|
||||||
|
|
||||||
|
OSD и клиенты будут пробовать использовать TCP-отправку без копирования (zero-copy) на
|
||||||
|
основе io_uring для буферов, больших, чем это число байт. Отправка без копирования
|
||||||
|
поддерживается в io_uring, начиная с версии ядра Linux 6.1. Наличие поддержки
|
||||||
|
проверяется автоматически и zero-copy отключается, когда поддержки нет. Также
|
||||||
|
её можно отключить явно, установив данный параметр в отрицательное значение.
|
||||||
|
|
||||||
|
⚠️ Внимание! Производительность данной функции может сильно отличаться на разных
|
||||||
|
процессорах и на разных версиях ядра Linux. В целом, zero-copy обычно быстрее с
|
||||||
|
большими сообщениями, а с мелкими (например, 4 КБ) zero-copy может быть даже
|
||||||
|
медленнее. 32 КБ достаточно почти для всех процессоров, но для каких-то можно
|
||||||
|
использовать даже меньшие значения. Например, для EPYC Milan/Genoa подходит 4 КБ,
|
||||||
|
а для Xeon Ice Lake - 12 КБ (но, пожалуйста, перепроверьте это сами).
|
||||||
|
|
||||||
|
Инструкция по проверке:
|
||||||
|
1. Добавьте `iommu=pt` в командную строку загрузки вашего ядра Linux и перезагрузитесь.
|
||||||
|
2. Обновите ядро. Например, для AMD EPYC очень важно использовать версию 6.11+.
|
||||||
|
3. Позапускайте тесты с помощью [send-zerocopy из примеров liburing](https://github.com/axboe/liburing/blob/master/examples/send-zerocopy.c),
|
||||||
|
чтобы найти минимальный размер сообщения, для которого zero-copy отправка оптимальна.
|
||||||
|
Запускайте `./send-zerocopy tcp -4 -R` на стороне сервера и
|
||||||
|
`time ./send-zerocopy tcp -4 -b 0 -s РАЗМЕР_БУФЕРА -D АДРЕС_СЕРВЕРА` на стороне клиента
|
||||||
|
с опцией `-z 0` (обычная отправка) и `-z 1` (отправка без копирования), и сравнивайте
|
||||||
|
скорость в МБ/с и занятое процессорное время (user+system).
|
||||||
|
|
||||||
## use_sync_send_recv
|
## use_sync_send_recv
|
||||||
|
|
||||||
- Тип: булево (да/нет)
|
- Тип: булево (да/нет)
|
||||||
|
|
|
@ -63,6 +63,8 @@ with an OSD restart or, for some of them, even without restarting by updating co
|
||||||
- [discard_on_start](#discard_on_start)
|
- [discard_on_start](#discard_on_start)
|
||||||
- [min_discard_size](#min_discard_size)
|
- [min_discard_size](#min_discard_size)
|
||||||
- [allow_net_split](#allow_net_split)
|
- [allow_net_split](#allow_net_split)
|
||||||
|
- [enable_pg_locks](#enable_pg_locks)
|
||||||
|
- [pg_lock_retry_interval_ms](#pg_lock_retry_interval_ms)
|
||||||
|
|
||||||
## bind_address
|
## bind_address
|
||||||
|
|
||||||
|
@ -647,3 +649,20 @@ The downside is that it increases the probability of writing data into just pg_m
|
||||||
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
|
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
|
||||||
|
|
||||||
The old behaviour in versions up to 2.0.0 was equal to enabled allow_net_split.
|
The old behaviour in versions up to 2.0.0 was equal to enabled allow_net_split.
|
||||||
|
|
||||||
|
## enable_pg_locks
|
||||||
|
|
||||||
|
- Type: boolean
|
||||||
|
|
||||||
|
Vitastor 2.2.0 introduces a new layer of split-brain prevention mechanism in
|
||||||
|
addition to etcd: PG locks. They prevent split-brain even in abnormal theoretical cases
|
||||||
|
when etcd is extremely laggy. As a new feature, by default, PG locks are only enabled
|
||||||
|
for pools where they're required - pools with [localized reads](pool.en.md#local_reads).
|
||||||
|
Use this parameter to enable or disable this function for all pools.
|
||||||
|
|
||||||
|
## pg_lock_retry_interval_ms
|
||||||
|
|
||||||
|
- Type: milliseconds
|
||||||
|
- Default: 100
|
||||||
|
|
||||||
|
Retry interval for failed PG lock attempts.
|
||||||
|
|
|
@ -64,6 +64,8 @@
|
||||||
- [discard_on_start](#discard_on_start)
|
- [discard_on_start](#discard_on_start)
|
||||||
- [min_discard_size](#min_discard_size)
|
- [min_discard_size](#min_discard_size)
|
||||||
- [allow_net_split](#allow_net_split)
|
- [allow_net_split](#allow_net_split)
|
||||||
|
- [enable_pg_locks](#enable_pg_locks)
|
||||||
|
- [pg_lock_retry_interval_ms](#pg_lock_retry_interval_ms)
|
||||||
|
|
||||||
## bind_address
|
## bind_address
|
||||||
|
|
||||||
|
@ -679,3 +681,21 @@ pg_minsize OSD во время переключений, что может по
|
||||||
неполными (incomplete), если упадут ещё какие-то OSD.
|
неполными (incomplete), если упадут ещё какие-то OSD.
|
||||||
|
|
||||||
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
||||||
|
|
||||||
|
## enable_pg_locks
|
||||||
|
|
||||||
|
- Тип: булево (да/нет)
|
||||||
|
|
||||||
|
В Vitastor 2.2.0 появился новый слой защиты от сплитбрейна в дополнение к etcd -
|
||||||
|
блокировки PG. Они гарантируют порядок даже в теоретических ненормальных случаях,
|
||||||
|
когда etcd очень сильно тормозит. Так как функция новая, по умолчанию она включается
|
||||||
|
только для пулов, в которых она необходима - а именно, в пулах с включёнными
|
||||||
|
[локальными чтениями](pool.ru.md#local_reads). Ну а с помощью данного параметра
|
||||||
|
можно включить блокировки PG для всех пулов.
|
||||||
|
|
||||||
|
## pg_lock_retry_interval_ms
|
||||||
|
|
||||||
|
- Тип: миллисекунды
|
||||||
|
- Значение по умолчанию: 100
|
||||||
|
|
||||||
|
Интервал повтора неудачных попыток блокировки PG.
|
||||||
|
|
|
@ -34,6 +34,7 @@ Parameters:
|
||||||
- [failure_domain](#failure_domain)
|
- [failure_domain](#failure_domain)
|
||||||
- [level_placement](#level_placement)
|
- [level_placement](#level_placement)
|
||||||
- [raw_placement](#raw_placement)
|
- [raw_placement](#raw_placement)
|
||||||
|
- [local_reads](#local_reads)
|
||||||
- [max_osd_combinations](#max_osd_combinations)
|
- [max_osd_combinations](#max_osd_combinations)
|
||||||
- [block_size](#block_size)
|
- [block_size](#block_size)
|
||||||
- [bitmap_granularity](#bitmap_granularity)
|
- [bitmap_granularity](#bitmap_granularity)
|
||||||
|
@ -133,8 +134,8 @@ Pool name.
|
||||||
## scheme
|
## scheme
|
||||||
|
|
||||||
- Type: string
|
- Type: string
|
||||||
- Required
|
|
||||||
- One of: "replicated", "xor", "ec" or "jerasure"
|
- One of: "replicated", "xor", "ec" or "jerasure"
|
||||||
|
- Required
|
||||||
|
|
||||||
Redundancy scheme used for data in this pool. "jerasure" is an alias for "ec",
|
Redundancy scheme used for data in this pool. "jerasure" is an alias for "ec",
|
||||||
both use Reed-Solomon-Vandermonde codes based on ISA-L or jerasure libraries.
|
both use Reed-Solomon-Vandermonde codes based on ISA-L or jerasure libraries.
|
||||||
|
@ -289,6 +290,30 @@ Examples:
|
||||||
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||||
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
|
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||||
|
|
||||||
|
## local_reads
|
||||||
|
|
||||||
|
- Type: string
|
||||||
|
- One of: "primary", "nearest" or "random"
|
||||||
|
- Default: primary
|
||||||
|
|
||||||
|
By default, Vitastor serves all read and write requests from the primary OSD of each PG.
|
||||||
|
But it can also serve read requests for replicated pools from secondary OSDs in clean PGs
|
||||||
|
(active or active+left_on_dead) which may be useful if you have OSDs with different network
|
||||||
|
latency to the client - for example, if you have a cross-datacenter setup.
|
||||||
|
|
||||||
|
If you set this parameter to "nearest", clients will try to read from the nearest OSD
|
||||||
|
in the [Placement Tree](#placement-tree), i.e. from an OSD from the same host or datacenter.
|
||||||
|
Distance to different OSDs will be calculated based on client hostname, determined
|
||||||
|
automatically or set manually in the [hostname](client.en.md#hostname) parameter.
|
||||||
|
|
||||||
|
If you set this parameter to "random", clients will try to distribute read requests over
|
||||||
|
all available secondary OSDs. This mode is mainly useful for tests, but, probably, not
|
||||||
|
really required in production setups.
|
||||||
|
|
||||||
|
[PG locks](osd.en.md#enable_pg_locks) are required for local reads to function. However,
|
||||||
|
PG locks are enabled automatically by default for pools with enabled local reads, so you
|
||||||
|
don't have to enable them explicitly.
|
||||||
|
|
||||||
## max_osd_combinations
|
## max_osd_combinations
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
|
@ -324,7 +349,8 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
|
||||||
|
|
||||||
## immediate_commit
|
## immediate_commit
|
||||||
|
|
||||||
- Type: string, one of "all", "small" and "none"
|
- Type: string
|
||||||
|
- One of: "all", "small" or "none"
|
||||||
- Default: none
|
- Default: none
|
||||||
|
|
||||||
Immediate commit setting for this pool. The value from /vitastor/config/global
|
Immediate commit setting for this pool. The value from /vitastor/config/global
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
- [failure_domain](#failure_domain)
|
- [failure_domain](#failure_domain)
|
||||||
- [level_placement](#level_placement)
|
- [level_placement](#level_placement)
|
||||||
- [raw_placement](#raw_placement)
|
- [raw_placement](#raw_placement)
|
||||||
|
- [local_reads](#local_reads)
|
||||||
- [max_osd_combinations](#max_osd_combinations)
|
- [max_osd_combinations](#max_osd_combinations)
|
||||||
- [block_size](#block_size)
|
- [block_size](#block_size)
|
||||||
- [bitmap_granularity](#bitmap_granularity)
|
- [bitmap_granularity](#bitmap_granularity)
|
||||||
|
@ -133,8 +134,8 @@ OSD игнорируется и OSD не удаляется из распред
|
||||||
## scheme
|
## scheme
|
||||||
|
|
||||||
- Тип: строка
|
- Тип: строка
|
||||||
- Обязательный
|
|
||||||
- Возможные значения: "replicated", "xor", "ec" или "jerasure"
|
- Возможные значения: "replicated", "xor", "ec" или "jerasure"
|
||||||
|
- Обязательный
|
||||||
|
|
||||||
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
|
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
|
||||||
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
|
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
|
||||||
|
@ -287,6 +288,30 @@ meow недоступен".
|
||||||
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||||
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
|
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||||
|
|
||||||
|
## local_reads
|
||||||
|
|
||||||
|
- Тип: строка
|
||||||
|
- Возможные значения: "primary", "nearest" или "random"
|
||||||
|
- По умолчанию: primary
|
||||||
|
|
||||||
|
По умолчанию Vitastor обслуживает все запросы чтения и записи с первичного OSD каждой PG.
|
||||||
|
Однако, в чистых PG (active или active+left_on_dead) реплицированных пулов также есть
|
||||||
|
возможность обслуживать запросы чтения с вторичных OSD, что может быть полезно, если
|
||||||
|
у вас сильно отличается время сетевого обращения от клиента к разным OSD - например,
|
||||||
|
если у вас несколько дата-центров.
|
||||||
|
|
||||||
|
Если данный параметр установлен в значение "nearest", клиенты будут стараться читать с
|
||||||
|
ближайших по [Дереву размещения](#дерево-размещения) OSD, то есть, с OSD с того же хоста
|
||||||
|
или датацентра. Расстояние до разных OSD будет рассчитываться с помощью имени хоста клиента,
|
||||||
|
определяемого автоматически или заданного вручную параметром [hostname](client.ru.md#hostname).
|
||||||
|
|
||||||
|
Если данный параметр установлен в значение "random", клиенты будут стараться распределять
|
||||||
|
запросы чтения по всем доступным вторичным OSD. Этот режим в основном полезен для тестов,
|
||||||
|
но, скорее всего, редко нужен в реальных инсталляциях.
|
||||||
|
|
||||||
|
Для работы локальных чтений требуются [блокировки PG](osd.ru.md#enable_pg_locks). Включать
|
||||||
|
их явно не нужно - они включаются автоматически для пулов с включёнными локальными чтениями.
|
||||||
|
|
||||||
## max_osd_combinations
|
## max_osd_combinations
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
|
@ -324,7 +349,8 @@ meow недоступен".
|
||||||
|
|
||||||
## immediate_commit
|
## immediate_commit
|
||||||
|
|
||||||
- Тип: строка "all", "small" или "none"
|
- Тип: строка
|
||||||
|
- Возможные значения: "all", "small" или "none"
|
||||||
- По умолчанию: none
|
- По умолчанию: none
|
||||||
|
|
||||||
Настройка мгновенного коммита для данного пула. Если не задана, используется
|
Настройка мгновенного коммита для данного пула. Если не задана, используется
|
||||||
|
|
|
@ -271,3 +271,15 @@
|
||||||
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
||||||
восстановить работу кластера после ошибок отсутствия свободного места
|
восстановить работу кластера после ошибок отсутствия свободного места
|
||||||
без уничтожения и пересоздания OSD.
|
без уничтожения и пересоздания OSD.
|
||||||
|
- name: hostname
|
||||||
|
type: string
|
||||||
|
online: true
|
||||||
|
info: |
|
||||||
|
Clients use host name to find their distance to OSDs when [localized reads](pool.en.md#local_reads)
|
||||||
|
are enabled. By default, standard [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html)
|
||||||
|
function is used to determine host name, but you can also override it with this parameter.
|
||||||
|
info_ru: |
|
||||||
|
Клиенты используют имя хоста для определения расстояния до OSD, когда включены
|
||||||
|
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
|
||||||
|
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
|
||||||
|
но вы также можете задать имя хоста вручную данным параметром.
|
||||||
|
|
|
@ -373,6 +373,55 @@
|
||||||
параметра читается без дополнительного копирования. Вы можете попробовать
|
параметра читается без дополнительного копирования. Вы можете попробовать
|
||||||
поменять этот параметр и посмотреть, как он влияет на производительность
|
поменять этот параметр и посмотреть, как он влияет на производительность
|
||||||
случайного и линейного доступа.
|
случайного и линейного доступа.
|
||||||
|
- name: min_zerocopy_send_size
|
||||||
|
type: int
|
||||||
|
default: 32768
|
||||||
|
info: |
|
||||||
|
OSDs and clients will attempt to use io_uring-based zero-copy TCP send
|
||||||
|
for buffers larger than this number of bytes. Zero-copy send with io_uring is
|
||||||
|
supported since Linux kernel version 6.1. Support is auto-detected and disabled
|
||||||
|
automatically when not available. It can also be disabled explicitly by setting
|
||||||
|
this parameter to a negative value.
|
||||||
|
|
||||||
|
⚠️ Warning! Zero-copy send performance may vary greatly from CPU to CPU and from
|
||||||
|
one kernel version to another. Generally, it tends to only make benefit with larger
|
||||||
|
messages. With smaller messages (say, 4 KB), it may actually be slower. 32 KB is
|
||||||
|
enough for almost all CPUs, but even smaller values are optimal for some of them.
|
||||||
|
For example, 4 KB is OK for EPYC Milan/Genoa and 12 KB is OK for Xeon Ice Lake
|
||||||
|
(but verify it yourself please).
|
||||||
|
|
||||||
|
Verification instructions:
|
||||||
|
1. Add `iommu=pt` into your Linux kernel command line and reboot.
|
||||||
|
2. Upgrade your kernel. For example, it's very important to use 6.11+ with recent AMD EPYCs.
|
||||||
|
3. Run some tests with the [send-zerocopy liburing example](https://github.com/axboe/liburing/blob/master/examples/send-zerocopy.c)
|
||||||
|
to find the minimal message size for which zero-copy is optimal.
|
||||||
|
Use `./send-zerocopy tcp -4 -R` at the server side and
|
||||||
|
`time ./send-zerocopy tcp -4 -b 0 -s BUFFER_SIZE -D SERVER_IP` at the client side with
|
||||||
|
`-z 0` (no zero-copy) and `-z 1` (zero-copy), and compare MB/s and used CPU time
|
||||||
|
(user+system).
|
||||||
|
info_ru: |
|
||||||
|
OSD и клиенты будут пробовать использовать TCP-отправку без копирования (zero-copy) на
|
||||||
|
основе io_uring для буферов, больших, чем это число байт. Отправка без копирования
|
||||||
|
поддерживается в io_uring, начиная с версии ядра Linux 6.1. Наличие поддержки
|
||||||
|
проверяется автоматически и zero-copy отключается, когда поддержки нет. Также
|
||||||
|
её можно отключить явно, установив данный параметр в отрицательное значение.
|
||||||
|
|
||||||
|
⚠️ Внимание! Производительность данной функции может сильно отличаться на разных
|
||||||
|
процессорах и на разных версиях ядра Linux. В целом, zero-copy обычно быстрее с
|
||||||
|
большими сообщениями, а с мелкими (например, 4 КБ) zero-copy может быть даже
|
||||||
|
медленнее. 32 КБ достаточно почти для всех процессоров, но для каких-то можно
|
||||||
|
использовать даже меньшие значения. Например, для EPYC Milan/Genoa подходит 4 КБ,
|
||||||
|
а для Xeon Ice Lake - 12 КБ (но, пожалуйста, перепроверьте это сами).
|
||||||
|
|
||||||
|
Инструкция по проверке:
|
||||||
|
1. Добавьте `iommu=pt` в командную строку загрузки вашего ядра Linux и перезагрузитесь.
|
||||||
|
2. Обновите ядро. Например, для AMD EPYC очень важно использовать версию 6.11+.
|
||||||
|
3. Позапускайте тесты с помощью [send-zerocopy из примеров liburing](https://github.com/axboe/liburing/blob/master/examples/send-zerocopy.c),
|
||||||
|
чтобы найти минимальный размер сообщения, для которого zero-copy отправка оптимальна.
|
||||||
|
Запускайте `./send-zerocopy tcp -4 -R` на стороне сервера и
|
||||||
|
`time ./send-zerocopy tcp -4 -b 0 -s РАЗМЕР_БУФЕРА -D АДРЕС_СЕРВЕРА` на стороне клиента
|
||||||
|
с опцией `-z 0` (обычная отправка) и `-z 1` (отправка без копирования), и сравнивайте
|
||||||
|
скорость в МБ/с и занятое процессорное время (user+system).
|
||||||
- name: use_sync_send_recv
|
- name: use_sync_send_recv
|
||||||
type: bool
|
type: bool
|
||||||
default: false
|
default: false
|
||||||
|
|
|
@ -781,3 +781,23 @@
|
||||||
неполными (incomplete), если упадут ещё какие-то OSD.
|
неполными (incomplete), если упадут ещё какие-то OSD.
|
||||||
|
|
||||||
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
Старое поведение в версиях до 2.0.0 было идентично включённому allow_net_split.
|
||||||
|
- name: enable_pg_locks
|
||||||
|
type: bool
|
||||||
|
info: |
|
||||||
|
Vitastor 2.2.0 introduces a new layer of split-brain prevention mechanism in
|
||||||
|
addition to etcd: PG locks. They prevent split-brain even in abnormal theoretical cases
|
||||||
|
when etcd is extremely laggy. As a new feature, by default, PG locks are only enabled
|
||||||
|
for pools where they're required - pools with [localized reads](pool.en.md#local_reads).
|
||||||
|
Use this parameter to enable or disable this function for all pools.
|
||||||
|
info_ru: |
|
||||||
|
В Vitastor 2.2.0 появился новый слой защиты от сплитбрейна в дополнение к etcd -
|
||||||
|
блокировки PG. Они гарантируют порядок даже в теоретических ненормальных случаях,
|
||||||
|
когда etcd очень сильно тормозит. Так как функция новая, по умолчанию она включается
|
||||||
|
только для пулов, в которых она необходима - а именно, в пулах с включёнными
|
||||||
|
[локальными чтениями](pool.ru.md#local_reads). Ну а с помощью данного параметра
|
||||||
|
можно включить блокировки PG для всех пулов.
|
||||||
|
- name: pg_lock_retry_interval_ms
|
||||||
|
type: ms
|
||||||
|
default: 100
|
||||||
|
info: Retry interval for failed PG lock attempts.
|
||||||
|
info_ru: Интервал повтора неудачных попыток блокировки PG.
|
||||||
|
|
|
@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
|
||||||
The instruction is very simple.
|
The instruction is very simple.
|
||||||
|
|
||||||
1. Download a Docker image of the desired version: \
|
1. Download a Docker image of the desired version: \
|
||||||
`docker pull vitastor:2.1.0`
|
`docker pull vitastor:v2.2.0`
|
||||||
2. Install scripts to the host system: \
|
2. Install scripts to the host system: \
|
||||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.1.0 install.sh`
|
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
|
||||||
3. Reload udev rules: \
|
3. Reload udev rules: \
|
||||||
`udevadm control --reload-rules`
|
`udevadm control --reload-rules`
|
||||||
|
|
||||||
|
|
|
@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
|
||||||
Инструкция по установке максимально простая.
|
Инструкция по установке максимально простая.
|
||||||
|
|
||||||
1. Скачайте Docker-образ желаемой версии: \
|
1. Скачайте Docker-образ желаемой версии: \
|
||||||
`docker pull vitastor:2.1.0`
|
`docker pull vitastor:v2.2.0`
|
||||||
2. Установите скрипты в хост-систему командой: \
|
2. Установите скрипты в хост-систему командой: \
|
||||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.1.0 install.sh`
|
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
|
||||||
3. Перезагрузите правила udev: \
|
3. Перезагрузите правила udev: \
|
||||||
`udevadm control --reload-rules`
|
`udevadm control --reload-rules`
|
||||||
|
|
||||||
|
|
|
@ -125,6 +125,13 @@ All other client-side components are based on the client library:
|
||||||
all current read/write operations to it fail with EPIPE error and are retried by clients.
|
all current read/write operations to it fail with EPIPE error and are retried by clients.
|
||||||
- After completing all secondary read/write requests, primary OSD sends the response to
|
- After completing all secondary read/write requests, primary OSD sends the response to
|
||||||
the client.
|
the client.
|
||||||
|
- When [localized reads](../config/pool.en.md#local_reads) are enabled for a PG in a
|
||||||
|
replicated pool, and the PG is in an active and clean state (active or
|
||||||
|
active+left_on_dead), the client can send the request to one of secondary OSDs instead
|
||||||
|
of the primary. Secondary OSD checks the [PG lock](../config/osd.en.md#enable_pg_locks)
|
||||||
|
and handles the request locally without communicating to the primary. PG lock is required
|
||||||
|
for the secondary OSD to know for sure that the PG is in clean state and not switching
|
||||||
|
primary at the moment.
|
||||||
|
|
||||||
### Nuances of request handling
|
### Nuances of request handling
|
||||||
|
|
||||||
|
|
|
@ -125,6 +125,12 @@
|
||||||
и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
|
и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
|
||||||
и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
|
и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
|
||||||
- После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.
|
- После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.
|
||||||
|
- Если в реплицированном пуле включены [локализованные чтения](../config/pool.ru.md#local_reads),
|
||||||
|
а PG находится в чистом активном состоянии (active или active+left_on_dead), клиент может
|
||||||
|
послать запрос к одному из вторичных OSD вместо первичного. Вторичный OSD проверяет
|
||||||
|
[блокировку PG](../config/osd.ru.md#enable_pg_locks) и обрабатывает запрос локально, не
|
||||||
|
обращаясь к первичному. Блокировка PG здесь нужна, чтобы вторичный OSD мог точно знать,
|
||||||
|
что PG находится в чистом состоянии и не переключается на другой первичный OSD.
|
||||||
|
|
||||||
### Особенности обработки запросов
|
### Особенности обработки запросов
|
||||||
|
|
||||||
|
|
|
@ -10,8 +10,17 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
|
||||||
|
|
||||||
Join Vitastor Telegram Chat: https://t.me/vitastor
|
Join Vitastor Telegram Chat: https://t.me/vitastor
|
||||||
|
|
||||||
All server-side code (OSD, Monitor and so on) is licensed under the terms of
|
License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.
|
||||||
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
|
|
||||||
|
Server-side code is licensed only under the terms of VNPL.
|
||||||
|
|
||||||
|
Client libraries (cluster_client and so on) are dual-licensed under the same
|
||||||
|
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
|
||||||
|
software like QEMU and fio.
|
||||||
|
|
||||||
|
## VNPL
|
||||||
|
|
||||||
|
Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
|
||||||
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
|
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
|
||||||
opensourcing all programs directly or indirectly interacting with Vitastor
|
opensourcing all programs directly or indirectly interacting with Vitastor
|
||||||
through a computer network and expressly designed to be used in conjunction
|
through a computer network and expressly designed to be used in conjunction
|
||||||
|
@ -20,18 +29,83 @@ the terms of the same license, but also under the terms of any GPL-Compatible
|
||||||
Free Software License, as listed by the Free Software Foundation.
|
Free Software License, as listed by the Free Software Foundation.
|
||||||
This is a stricter copyleft license than the Affero GPL.
|
This is a stricter copyleft license than the Affero GPL.
|
||||||
|
|
||||||
Please note that VNPL doesn't require you to open the code of proprietary
|
The idea of VNPL is, in addition to modules linked to Vitastor code in a single
|
||||||
software running inside a VM if it's not specially designed to be used with
|
binary file, to extend copyleft action to micro-service modules only interacting
|
||||||
Vitastor.
|
with it over the network.
|
||||||
|
|
||||||
Basically, you can't use the software in a proprietary environment to provide
|
Basically, you can't use the software in a proprietary environment to provide
|
||||||
its functionality to users without opensourcing all intermediary components
|
its functionality to users without opensourcing all intermediary components
|
||||||
standing between the user and Vitastor or purchasing a commercial license
|
standing between the user and Vitastor or purchasing a commercial license
|
||||||
from the author 😀.
|
from the author 😀.
|
||||||
|
|
||||||
Client libraries (cluster_client and so on) are dual-licensed under the same
|
At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
|
||||||
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
|
to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
|
||||||
software like QEMU and fio.
|
|
||||||
|
|
||||||
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
|
## Explanation
|
||||||
GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).
|
|
||||||
|
Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.
|
||||||
|
|
||||||
|
A program is considered to be a "Proxy Program" if it meets both conditions:
|
||||||
|
- It is specially designed to be used with Vitastor. Basically, it means that the program
|
||||||
|
has any functionality specific to Vitastor and thus "knows" that it works with Vitastor,
|
||||||
|
not with something random.
|
||||||
|
- It interacts with Vitastor directly or indirectly through any programming interface,
|
||||||
|
including API, CLI, network or any wrapper (also considered a Proxy Program itself).
|
||||||
|
|
||||||
|
If, in addition to that:
|
||||||
|
- You give any user an opportunity to interact with Vitastor directly or indirectly through
|
||||||
|
any computer interface including the network or any number of wrappers (Proxy Programs).
|
||||||
|
|
||||||
|
Then VNPL requires you to publish the code of all above Proxy Programs to all above users
|
||||||
|
under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2,
|
||||||
|
because "GPL compatibility" is treated as an ability to legally include licensed code in
|
||||||
|
a GPL application.
|
||||||
|
|
||||||
|
So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
|
||||||
|
interacts with Vitastor - you are forbidden to use Vitastor under the terms of VNPL and you
|
||||||
|
need a commercial license which doesn't contain open-source requirements.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
|
||||||
|
- Yes, it interacts with Vitastor through vitastor-cli.
|
||||||
|
- Yes, it is designed specially for use with Vitastor (it has no sense otherwise).
|
||||||
|
- So, CSI driver **definitely IS** a Proxy Program and must be published under the terms of
|
||||||
|
a free software license.
|
||||||
|
- Windows, installed in a VM with the system disk on Vitastor storage.
|
||||||
|
- Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
|
||||||
|
device interface, emulated by QEMU.
|
||||||
|
- No, it definitely isn't designed specially for use with Vitastor - Windows was created long
|
||||||
|
ago before Vitastor and doesn't know anything about it.
|
||||||
|
- So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require to open it.
|
||||||
|
- Cloud control panel which makes requests to Vitastor Kubernetes CSI driver.
|
||||||
|
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
|
||||||
|
- May or may not be designed specially for use with Vitastor. How to determine exactly?
|
||||||
|
Imagine that Vitastor is replaced with any other storage (for example, with a proprietary).
|
||||||
|
Do control panel functions change in any way? If they do (for example, if snapshots stop working),
|
||||||
|
then the panel contains specific functionality and thus is designed specially for use with Vitastor.
|
||||||
|
Otherwise, the panel is universal and isn't designed specially for Vitastor.
|
||||||
|
- So, whether you are required to open-source the panel also **depends** on whether it
|
||||||
|
contains specific functionality or not.
|
||||||
|
|
||||||
|
## Why?
|
||||||
|
|
||||||
|
Because I believe into the spirit of copyleft (Linux wouldn't become so popular without GPL!)
|
||||||
|
and, at the same time, I want to have a way to monetize the product.
|
||||||
|
|
||||||
|
Existing licenses including AGPL are useless for it with an SDS - SDS is a very deeply
|
||||||
|
internal software which is almost definitely invisible to the user and thus AGPL doesn't
|
||||||
|
require anyone to open the code even if they make a proprietary fork.
|
||||||
|
|
||||||
|
And, in fact, the current situation in the world where GPL is thought to only restrict direct
|
||||||
|
linking of programs into a single executable file, isn't much correct. Nowadays, programs
|
||||||
|
are more often linked with network API calls, not with /usr/bin/ld, and a software product
|
||||||
|
may consist of dozens of microservices interacting with each other over the network.
|
||||||
|
|
||||||
|
That's why we need VNPL to keep the license sufficiently copyleft.
|
||||||
|
|
||||||
|
## License Texts
|
||||||
|
|
||||||
|
- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
|
||||||
|
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
|
||||||
|
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
|
||||||
|
|
|
@ -12,6 +12,14 @@
|
||||||
|
|
||||||
Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
|
Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
|
||||||
|
|
||||||
|
Серверные компоненты распространяются только на условиях VNPL.
|
||||||
|
|
||||||
|
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
|
||||||
|
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
|
||||||
|
совместимости с таким ПО, как QEMU и fio.
|
||||||
|
|
||||||
|
## VNPL
|
||||||
|
|
||||||
VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
|
VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
|
||||||
Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
|
Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
|
||||||
условием "Сетевого взаимодействия", требующим распространять все программы,
|
условием "Сетевого взаимодействия", требующим распространять все программы,
|
||||||
|
@ -29,9 +37,70 @@ Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с д
|
||||||
На Windows и любое другое ПО, не разработанное *специально* для использования
|
На Windows и любое другое ПО, не разработанное *специально* для использования
|
||||||
вместе с Vitastor, никакие ограничения не накладываются.
|
вместе с Vitastor, никакие ограничения не накладываются.
|
||||||
|
|
||||||
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
|
## Пояснение
|
||||||
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
|
|
||||||
совместимости с таким ПО, как QEMU и fio.
|
|
||||||
|
|
||||||
Вы можете найти полный текст VNPL 1.1 на английском языке в файле [VNPL-1.1.txt](../../VNPL-1.1.txt),
|
Сетевой копилефт регулируется пунктом лицензии **13. Удалённое сетевое взаимодействие**.
|
||||||
VNPL 1.1 на русском языке в файле [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), а GPL 2.0 в файле [GPL-2.0.txt](../../GPL-2.0.txt).
|
|
||||||
|
Программа считается "прокси-программой", если верны оба условия:
|
||||||
|
- Она создана специально для работы вместе с Vitastor. По сути это означает, что программа
|
||||||
|
должна иметь специфичный для Vitastor функционал, то есть, "знать", что она взаимодействует
|
||||||
|
именно с Vitastor.
|
||||||
|
- Она прямо или косвенно взаимодействует с Vitastor через абсолютно любой программный
|
||||||
|
интерфейс, включая любые способы вызова: API, CLI, сеть или через какую-то обёртку (в
|
||||||
|
свою очередь тоже являющуюся прокси-программой).
|
||||||
|
|
||||||
|
Если в дополнение к этому также:
|
||||||
|
- Вы предоставляете любому пользователю возможность взаимодействовать с Vitastor по сети,
|
||||||
|
опять-таки, через любой интерфейс или любую серию "обёрток" (прокси-программ)
|
||||||
|
|
||||||
|
То, согласно VNPL, вы должны открыть код "прокси-программ" **таким пользователям** на условиях
|
||||||
|
любой GPL-совместимой лицензии - то есть, GPL, LGPL, MIT/BSD или Apache 2 - "совместимость с GPL"
|
||||||
|
понимается как возможность включать лицензируемый код в GPL-приложение.
|
||||||
|
|
||||||
|
Соответственно, если у вас есть "прокси-программа", но её код не открыт пользователю,
|
||||||
|
который прямо или косвенно взаимодействует с Vitastor - вам запрещено использовать Vitastor
|
||||||
|
на условиях VNPL и вам нужна коммерческая лицензия, не содержащая требований об открытии кода.
|
||||||
|
|
||||||
|
## Примеры
|
||||||
|
|
||||||
|
- Kubernetes CSI-драйвер Vitastor, создающий PersistentVolume с помощью вызова `vitastor-cli create`.
|
||||||
|
- Да, взаимодействует с Vitastor через vitastor-cli.
|
||||||
|
- Да, создавался специально для работы с Vitastor (иначе в чём же ещё его смысл).
|
||||||
|
- Значит, CSI-драйвер **точно считается** "прокси-программой" и должен быть открыт под свободной
|
||||||
|
лицензией.
|
||||||
|
- Windows, установленный в виртуальную машину на диске Vitastor.
|
||||||
|
- Да, взаимодействует с Vitastor "прямо или косвенно" - пишет и читает данные через интерфейс
|
||||||
|
блочного устройства, эмулируемый QEMU.
|
||||||
|
- Нет, точно не создан *специально для работы с Vitastor* - когда его создавали, никакого
|
||||||
|
Vitastor ещё и в помине не было.
|
||||||
|
- Значит, Windows **точно не считается** "прокси-программой" и на него требования VNPL не распространяются.
|
||||||
|
- Панель управления облака, делающая запросы к Kubernetes CSI-драйверу Vitastor.
|
||||||
|
- Да, взаимодействует с Vitastor косвенно через CSI-драйвер, являющийся "прокси-программой".
|
||||||
|
- Сходу не известно, создавалась ли конкретно для работы с Vitastor. Как понять, да или нет?
|
||||||
|
Представьте, что Vitastor заменён на любую другую систему хранения (например, на проприетарную).
|
||||||
|
Работа панели управления изменится? Если да (например, перестанут работать снапшоты) - значит,
|
||||||
|
панель содержит специфичный функционал и "создана специально для работы с Vitastor".
|
||||||
|
Если нет - значит, специфичного функционала панель не содержит и в принципе она универсальна.
|
||||||
|
- Нужно ли открывать панель - **зависит** от того, содержит она специфичный функционал или нет.
|
||||||
|
|
||||||
|
## Почему так?
|
||||||
|
|
||||||
|
Потому что я одновременно верю в дух копилефт-лицензий (Linux не стал бы так популярен,
|
||||||
|
если бы не GPL!) и хочу иметь возможность монетизации продукта.
|
||||||
|
|
||||||
|
При этом использовать даже AGPL для программной СХД бессмысленно - это глубоко внутреннее
|
||||||
|
ПО, которое пользователь почти наверняка не увидит вообще, поэтому и открывать код никому
|
||||||
|
никогда не придётся, даже при создании производного продукта.
|
||||||
|
|
||||||
|
Да и в целом сложившаяся в мире ситуация, при которой действие GPL ограничивается только
|
||||||
|
прямым связыванием в один исполняемый файл, не очень корректна. В настоящее время программы
|
||||||
|
гораздо чаще интегрируют сетевыми вызовами, а не с помощью /usr/bin/ld, и общий программный
|
||||||
|
продукт может состоять из нескольких десятков микросервисов, взаимодействующих по сети.
|
||||||
|
|
||||||
|
Поэтому для сохранения достаточной "копилефтности" и придумана VNPL.
|
||||||
|
|
||||||
|
## Тексты лицензий
|
||||||
|
|
||||||
|
- VNPL 1.1 на английском языке: [VNPL-1.1.txt](../../VNPL-1.1.txt)
|
||||||
|
- VNPL 1.1 на русском языке: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
|
||||||
|
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
- Recovery of degraded blocks
|
- Recovery of degraded blocks
|
||||||
- Rebalancing (data movement between OSDs)
|
- Rebalancing (data movement between OSDs)
|
||||||
- [Lazy fsync support](../config/layout-cluster.en.md#immediate_commit)
|
- [Lazy fsync support](../config/layout-cluster.en.md#immediate_commit)
|
||||||
|
- [Localized read support](../config/pool.en.md#local_reads) for cross-datacenter setup optimization
|
||||||
- Per-OSD and per-image I/O and space usage statistics in etcd
|
- Per-OSD and per-image I/O and space usage statistics in etcd
|
||||||
- Snapshots and copy-on-write image clones
|
- Snapshots and copy-on-write image clones
|
||||||
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
- Восстановление деградированных блоков
|
- Восстановление деградированных блоков
|
||||||
- Ребаланс, то есть перемещение данных между OSD (дисками)
|
- Ребаланс, то есть перемещение данных между OSD (дисками)
|
||||||
- [Поддержка "ленивого" fsync (fsync не на каждую операцию)](../config/layout-cluster.ru.md#immediate_commit)
|
- [Поддержка "ленивого" fsync (fsync не на каждую операцию)](../config/layout-cluster.ru.md#immediate_commit)
|
||||||
|
- [Локальные чтения](../config/pool.ru.md#local_reads) для оптимизации при нескольких датацентрах
|
||||||
- Сбор статистики ввода/вывода в etcd
|
- Сбор статистики ввода/вывода в etcd
|
||||||
- Статистика операций ввода/вывода и занятого места в разрезе инодов
|
- Статистика операций ввода/вывода и занятого места в разрезе инодов
|
||||||
- Именование инодов через хранение их метаданных в etcd
|
- Именование инодов через хранение их метаданных в etcd
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
- [Removing a failed disk](#removing-a-failed-disk)
|
- [Removing a failed disk](#removing-a-failed-disk)
|
||||||
- [Adding a disk](#adding-a-disk)
|
- [Adding a disk](#adding-a-disk)
|
||||||
- [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration)
|
- [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration)
|
||||||
|
- [Incompatibility problems](#Incompatibility-problems)
|
||||||
- [Upgrading Vitastor](#upgrading-vitastor)
|
- [Upgrading Vitastor](#upgrading-vitastor)
|
||||||
- [OSD memory usage](#osd-memory-usage)
|
- [OSD memory usage](#osd-memory-usage)
|
||||||
|
|
||||||
|
@ -166,6 +167,17 @@ done
|
||||||
|
|
||||||
After that all PGs should peer and find all previous data.
|
After that all PGs should peer and find all previous data.
|
||||||
|
|
||||||
|
## Incompatibility problems
|
||||||
|
|
||||||
|
### ISA-L 2.31
|
||||||
|
|
||||||
|
⚠ It is FORBIDDEN to use Vitastor 2.1.0 and earlier versions with ISA-L 2.31 and newer if
|
||||||
|
you use EC N+K pools and K > 1 on a CPU with GF-NI instruction support, because it WILL
|
||||||
|
lead to **data loss** during EC recovery.
|
||||||
|
|
||||||
|
If you accidentally upgraded ISA-L to 2.31 but didn't upgrade Vitastor and restarted OSDs,
|
||||||
|
then stop them as soon as possible and either update Vitastor or roll back ISA-L.
|
||||||
|
|
||||||
## Upgrading Vitastor
|
## Upgrading Vitastor
|
||||||
|
|
||||||
Every upcoming Vitastor version is usually compatible with previous both forward
|
Every upcoming Vitastor version is usually compatible with previous both forward
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
- [Удаление неисправного диска](#удаление-неисправного-диска)
|
- [Удаление неисправного диска](#удаление-неисправного-диска)
|
||||||
- [Добавление диска](#добавление-диска)
|
- [Добавление диска](#добавление-диска)
|
||||||
- [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов)
|
- [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов)
|
||||||
|
- [Проблемы несовместимости](#проблемы-несовместимости)
|
||||||
- [Обновление Vitastor](#обновление-vitastor)
|
- [Обновление Vitastor](#обновление-vitastor)
|
||||||
- [Потребление памяти OSD](#потребление-памяти-osd)
|
- [Потребление памяти OSD](#потребление-памяти-osd)
|
||||||
|
|
||||||
|
@ -163,6 +164,17 @@ done
|
||||||
|
|
||||||
После этого все PG должны пройти peering и найти все предыдущие данные.
|
После этого все PG должны пройти peering и найти все предыдущие данные.
|
||||||
|
|
||||||
|
## Проблемы несовместимости
|
||||||
|
|
||||||
|
### ISA-L 2.31
|
||||||
|
|
||||||
|
⚠ ЗАПРЕЩЕНО использовать Vitastor 2.1.0 и более ранних версий с библиотекой ISA-L версии 2.31
|
||||||
|
или более новой, если вы используете EC-пулы N+K и K > 1 на CPU с поддержкой инструкций GF-NI,
|
||||||
|
так как это приведёт к **потере данных** при восстановлении из EC.
|
||||||
|
|
||||||
|
Если вы случайно обновили ISA-L до 2.31, но не обновили Vitastor, и успели перезапустить OSD,
|
||||||
|
то как можно скорее остановите их все и либо обновите Vitastor, либо откатите ISA-L.
|
||||||
|
|
||||||
## Обновление Vitastor
|
## Обновление Vitastor
|
||||||
|
|
||||||
Обычно каждая следующая версия Vitastor совместима с предыдущими и "вперёд", и "назад"
|
Обычно каждая следующая версия Vitastor совместима с предыдущими и "вперёд", и "назад"
|
||||||
|
|
|
@ -397,6 +397,7 @@ Optional parameters:
|
||||||
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
|
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
|
||||||
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
|
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
|
||||||
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
||||||
|
| `--local_reads primary` | Local read policy for replicated pools: primary, nearest or random |
|
||||||
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
||||||
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
||||||
| `--used_for_app fs:<name>` | Mark pool as used for VitastorFS with metadata in image `<name>` |
|
| `--used_for_app fs:<name>` | Mark pool as used for VitastorFS with metadata in image `<name>` |
|
||||||
|
|
|
@ -414,6 +414,7 @@ OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP
|
||||||
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
|
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
|
||||||
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
|
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
|
||||||
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
|
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
|
||||||
|
| `--local_reads primary` | Политика локальных чтений для реплик: primary, nearest или random |
|
||||||
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
|
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
|
||||||
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
|
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
|
||||||
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |
|
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |
|
||||||
|
|
|
@ -14,6 +14,9 @@ Commands:
|
||||||
- [upgrade](#upgrade)
|
- [upgrade](#upgrade)
|
||||||
- [defrag](#defrag)
|
- [defrag](#defrag)
|
||||||
|
|
||||||
|
⚠️ Important: follow the instructions from [Linux NFS write size](#linux-nfs-write-size)
|
||||||
|
for optimal Vitastor NFS performance if you use EC and HDD and mount your NFS from Linux.
|
||||||
|
|
||||||
## Pseudo-FS
|
## Pseudo-FS
|
||||||
|
|
||||||
Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
|
Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
|
||||||
|
@ -100,6 +103,62 @@ Other notable missing features which should be addressed in the future:
|
||||||
in the DB. The FS is implemented in such a way that this garbage doesn't affect its
|
in the DB. The FS is implemented in such a way that this garbage doesn't affect its
|
||||||
function, but having a tool to clean it up still seems a right thing to do.
|
function, but having a tool to clean it up still seems a right thing to do.
|
||||||
|
|
||||||
|
## Linux NFS write size
|
||||||
|
|
||||||
|
Linux NFS client (nfs/nfsv3/nfsv4 kernel modules) has a hard-coded maximum I/O size,
|
||||||
|
currently set to 1 MB - see `rsize` and `wsize` in [man 5 nfs](https://linux.die.net/man/5/nfs).
|
||||||
|
|
||||||
|
This means that when you write to a file in an FS mounted over NFS, the maximum write
|
||||||
|
request size is 1 MB, even in the O_DIRECT mode and even if the original write request
|
||||||
|
is larger.
|
||||||
|
|
||||||
|
However, for optimal linear write performance in Vitastor EC (erasure-coded) pools,
|
||||||
|
the size of write requests should be a multiple of [block_size](../config/layout-cluster.en.md#block_size),
|
||||||
|
multiplied by the data chunk count of the pool ([pg_size](../config/pool.en.md#pg_size)-[parity_chunks](../config/pool.en.md#parity_chunks)).
|
||||||
|
When write requests are smaller or not a multiple of this number, Vitastor has to first
|
||||||
|
read paired data blocks from disks, calculate new parity blocks and only then write them
|
||||||
|
back. Obviously this is 2-3 times slower than a simple disk write.
|
||||||
|
|
||||||
|
Vitastor HDD setups use 1 MB block_size by default. So, for optimal performance, if
|
||||||
|
you use EC 2+1 and HDD, you need your NFS client to send 2 MB write requests, if you
|
||||||
|
use EC 4+1 - 4 MB and so on.
|
||||||
|
|
||||||
|
But Linux NFS client only writes in 1 MB chunks. 😢
|
||||||
|
|
||||||
|
The good news is that you can fix it by rebuilding Linux NFS kernel modules 😉 🤩!
|
||||||
|
You need to change NFS_MAX_FILE_IO_SIZE in nfs_xdr.h and then rebuild and reload modules.
|
||||||
|
|
||||||
|
The instruction, using Debian as an example (should be ran under root):
|
||||||
|
|
||||||
|
```
|
||||||
|
# download current Linux kernel headers required to build modules
|
||||||
|
apt-get install linux-headers-`uname -r`
|
||||||
|
|
||||||
|
# replace NFS_MAX_FILE_IO_SIZE with a desired number (here it's 4194304 - 4 MB)
|
||||||
|
sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
|
||||||
|
|
||||||
|
# download current Linux kernel source
|
||||||
|
mkdir linux_src
|
||||||
|
cd linux_src
|
||||||
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
|
|
||||||
|
# build NFS modules
|
||||||
|
cd linux-*/fs/nfs
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD modules_install
|
||||||
|
|
||||||
|
# move default NFS modules away
|
||||||
|
mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
|
||||||
|
depmod -a
|
||||||
|
|
||||||
|
# unload old modules and load the new ones
|
||||||
|
rmmod nfsv3 nfs
|
||||||
|
modprobe nfsv3
|
||||||
|
```
|
||||||
|
|
||||||
|
After these (not much complicated 🙂) manipulations NFS begins to be mounted
|
||||||
|
with new wsize and rsize by default and it fixes Vitastor-NFS linear write performance.
|
||||||
|
|
||||||
## Horizontal scaling
|
## Horizontal scaling
|
||||||
|
|
||||||
Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
|
Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
|
||||||
|
|
|
@ -14,6 +14,9 @@
|
||||||
- [upgrade](#upgrade)
|
- [upgrade](#upgrade)
|
||||||
- [defrag](#defrag)
|
- [defrag](#defrag)
|
||||||
|
|
||||||
|
⚠️ Важно: для оптимальной производительности Vitastor NFS в Linux при использовании
|
||||||
|
HDD и EC (erasure кодов) выполните инструкции из раздела [Размер записи Linux NFS](#размер-записи-linux-nfs).
|
||||||
|
|
||||||
## Псевдо-ФС
|
## Псевдо-ФС
|
||||||
|
|
||||||
Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
|
Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
|
||||||
|
@ -104,6 +107,66 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||||
записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
|
записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
|
||||||
уметь подчищать.
|
уметь подчищать.
|
||||||
|
|
||||||
|
## Размер записи Linux NFS
|
||||||
|
|
||||||
|
Клиент Linux NFS (модули ядра nfs/nfsv3/nfsv4) имеет фиксированный в коде максимальный
|
||||||
|
размер запроса ввода-вывода, равный 1 МБ - см. `rsize` и `wsize` в [man 5 nfs](https://linux.die.net/man/5/nfs).
|
||||||
|
|
||||||
|
Это означает, что когда вы записываете в файл в примонтированной по NFS файловой системе,
|
||||||
|
максимальный размер запроса записи составляет 1 МБ, даже в режиме O_DIRECT и даже если
|
||||||
|
исходный запрос записи был больше.
|
||||||
|
|
||||||
|
Однако для оптимальной скорости линейной записи в Vitastor при использовании EC-пулов
|
||||||
|
(пулов с кодами коррекции ошибок) запросы записи должны быть по размеру кратны
|
||||||
|
[block_size](../config/layout-cluster.ru.md#block_size), умноженному на число частей
|
||||||
|
данных пула ([pg_size](../config/pool.ru.md#pg_size)-[parity_chunks](../config/pool.ru.md#parity_chunks)).
|
||||||
|
Если запросы записи меньше или не кратны, то Vitastor приходится сначала прочитать
|
||||||
|
с дисков старые версии парных блоков данных, рассчитать новые блоки чётности и только
|
||||||
|
после этого записать их на диски. Естественно, это в 2-3 раза медленнее простой записи
|
||||||
|
на диск.
|
||||||
|
|
||||||
|
При этом block_size на жёстких дисках по умолчанию устанавливается равным 1 МБ.
|
||||||
|
Таким образом, если вы используете EC 2+1 и HDD, для оптимальной скорости записи вам
|
||||||
|
нужно, чтобы NFS-клиент писал по 2 МБ, если EC 4+1 и HDD - то по 4 МБ, и т.п.
|
||||||
|
|
||||||
|
А Linux NFS-клиент пишет только по 1 МБ. 😢
|
||||||
|
|
||||||
|
Но это можно исправить, пересобрав модули ядра Linux NFS 😉 🤩! Для этого нужно
|
||||||
|
поменять значение переменной NFS_MAX_FILE_IO_SIZE в заголовочном файле nfs_xdr.h,
|
||||||
|
после чего пересобрать модули NFS.
|
||||||
|
|
||||||
|
Инструкция по пересборке на примере Debian (выполнять под root):
|
||||||
|
|
||||||
|
```
|
||||||
|
# скачиваем заголовки для сборки модулей для текущего ядра Linux
|
||||||
|
apt-get install linux-headers-`uname -r`
|
||||||
|
|
||||||
|
# заменяем в заголовках NFS_MAX_FILE_IO_SIZE на желаемый (здесь 4194304 - 4 МБ)
|
||||||
|
sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
|
||||||
|
|
||||||
|
# скачиваем исходный код текущего ядра
|
||||||
|
mkdir linux_src
|
||||||
|
cd linux_src
|
||||||
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
|
|
||||||
|
# собираем модули NFS
|
||||||
|
cd linux-*/fs/nfs
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD modules_install
|
||||||
|
|
||||||
|
# убираем в сторону штатные модули NFS
|
||||||
|
mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
|
||||||
|
depmod -a
|
||||||
|
|
||||||
|
# выгружаем старые модули и загружаем новые
|
||||||
|
rmmod nfsv3 nfs
|
||||||
|
modprobe nfsv3
|
||||||
|
```
|
||||||
|
|
||||||
|
После такой (относительно нехитрой 🙂) манипуляции NFS начинает по умолчанию
|
||||||
|
монтироваться с новыми wsize и rsize, и производительность линейной записи в Vitastor-NFS
|
||||||
|
исправляется.
|
||||||
|
|
||||||
## Горизонтальное масштабирование
|
## Горизонтальное масштабирование
|
||||||
|
|
||||||
Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
|
Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
|
||||||
|
|
|
@ -162,10 +162,12 @@ apt-get install linux-headers-`uname -r`
|
||||||
apt-get build-dep linux-image-`uname -r`-unsigned
|
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||||
apt-get source linux-image-`uname -r`-unsigned
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
cd linux*/drivers/vdpa
|
cd linux*/drivers/vdpa
|
||||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||||
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||||
cd ../virtio
|
cd ../virtio
|
||||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||||
depmod -a
|
depmod -a
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -165,10 +165,12 @@ apt-get install linux-headers-`uname -r`
|
||||||
apt-get build-dep linux-image-`uname -r`-unsigned
|
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||||
apt-get source linux-image-`uname -r`-unsigned
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
cd linux*/drivers/vdpa
|
cd linux*/drivers/vdpa
|
||||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||||
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||||
cd ../virtio
|
cd ../virtio
|
||||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||||
depmod -a
|
depmod -a
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -253,7 +253,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
|
||||||
for (let i = 1; i < rules.length; i++)
|
for (let i = 1; i < rules.length; i++)
|
||||||
{
|
{
|
||||||
const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
|
const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
|
||||||
const idx = select_murmur3(filtered.length, i => 'p:'+f.id+':'+filtered[i].id);
|
const idx = select_murmur3(filtered.length, i => 'p:'+f.id+':'+(filtered[i].name || filtered[i].id));
|
||||||
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
|
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
|
||||||
}
|
}
|
||||||
const size = selected.filter(s => s.id !== null).length;
|
const size = selected.filter(s => s.id !== null).length;
|
||||||
|
@ -270,7 +270,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
|
||||||
for (const item_rules of rules)
|
for (const item_rules of rules)
|
||||||
{
|
{
|
||||||
const filtered = selected.length ? filter_tree_by_rules(osd_tree, item_rules, selected) : first;
|
const filtered = selected.length ? filter_tree_by_rules(osd_tree, item_rules, selected) : first;
|
||||||
const idx = select_murmur3(filtered.length, i => n+':'+filtered[i].id);
|
const idx = select_murmur3(filtered.length, i => n+':'+(filtered[i].name || filtered[i].id));
|
||||||
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
|
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
|
||||||
}
|
}
|
||||||
const size = selected.filter(s => s.id !== null).length;
|
const size = selected.filter(s => s.id !== null).length;
|
||||||
|
@ -340,9 +340,9 @@ function filter_tree_by_rules(osd_tree, rules, selected)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert from
|
// Convert from
|
||||||
// node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
|
// node_list = { id: string|number, name?: string, level: string, size?: number, parent?: string|number }[]
|
||||||
// to
|
// to
|
||||||
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
|
// node_tree = { [node_id]: { id, name?, level, size?, parent?, children?: child_node[], levels: { [level]: id, ... } } }
|
||||||
function index_tree(node_list)
|
function index_tree(node_list)
|
||||||
{
|
{
|
||||||
const tree = { '': { children: [], levels: {} } };
|
const tree = { '': { children: [], levels: {} } };
|
||||||
|
@ -357,7 +357,7 @@ function index_tree(node_list)
|
||||||
tree[parent_id].children = tree[parent_id].children || [];
|
tree[parent_id].children = tree[parent_id].children || [];
|
||||||
tree[parent_id].children.push(tree[node.id]);
|
tree[parent_id].children.push(tree[node.id]);
|
||||||
}
|
}
|
||||||
const cur = tree[''].children;
|
const cur = [ ...tree[''].children ];
|
||||||
for (let i = 0; i < cur.length; i++)
|
for (let i = 0; i < cur.length; i++)
|
||||||
{
|
{
|
||||||
cur[i].levels[cur[i].level] = cur[i].id;
|
cur[i].levels[cur[i].level] = cur[i].id;
|
||||||
|
|
|
@ -0,0 +1,244 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
|
// Extract OSDs from the lowest affected tree level into a separate (flat) map
|
||||||
|
// to run PG optimisation on failure domains instead of individual OSDs
|
||||||
|
//
|
||||||
|
// node_list = same input as for index_tree()
|
||||||
|
// rules = [ level, operator, value ][][]
|
||||||
|
// returns { nodes: new_node_list, leaves: { new_folded_node_id: [ extracted_leaf_nodes... ] } }
|
||||||
|
function fold_failure_domains(node_list, rules)
|
||||||
|
{
|
||||||
|
const interest = {};
|
||||||
|
for (const level_rules of rules)
|
||||||
|
{
|
||||||
|
for (const rule of level_rules)
|
||||||
|
interest[rule[0]] = true;
|
||||||
|
}
|
||||||
|
const max_numeric_id = node_list.reduce((a, c) => a < (0|c.id) ? (0|c.id) : a, 0);
|
||||||
|
let next_id = max_numeric_id;
|
||||||
|
const node_map = node_list.reduce((a, c) => { a[c.id||''] = c; return a; }, {});
|
||||||
|
const old_ids_by_new = {};
|
||||||
|
const extracted_nodes = {};
|
||||||
|
let folded = true;
|
||||||
|
while (folded)
|
||||||
|
{
|
||||||
|
const per_parent = {};
|
||||||
|
for (const node_id in node_map)
|
||||||
|
{
|
||||||
|
const node = node_map[node_id];
|
||||||
|
const p = node.parent || '';
|
||||||
|
per_parent[p] = per_parent[p]||[];
|
||||||
|
per_parent[p].push(node);
|
||||||
|
}
|
||||||
|
folded = false;
|
||||||
|
for (const node_id in per_parent)
|
||||||
|
{
|
||||||
|
const fold_node = node_id !== '' && per_parent[node_id].length > 0 && per_parent[node_id].filter(child => per_parent[child.id||''] || interest[child.level]).length == 0;
|
||||||
|
if (fold_node)
|
||||||
|
{
|
||||||
|
const old_node = node_map[node_id];
|
||||||
|
const new_id = ++next_id;
|
||||||
|
node_map[new_id] = {
|
||||||
|
...old_node,
|
||||||
|
id: new_id,
|
||||||
|
name: node_id, // for use in murmur3 hashes
|
||||||
|
size: per_parent[node_id].reduce((a, c) => a + (Number(c.size)||0), 0),
|
||||||
|
};
|
||||||
|
delete node_map[node_id];
|
||||||
|
old_ids_by_new[new_id] = node_id;
|
||||||
|
extracted_nodes[new_id] = [];
|
||||||
|
for (const child of per_parent[node_id])
|
||||||
|
{
|
||||||
|
if (old_ids_by_new[child.id])
|
||||||
|
{
|
||||||
|
extracted_nodes[new_id].push(...extracted_nodes[child.id]);
|
||||||
|
delete extracted_nodes[child.id];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
extracted_nodes[new_id].push(child);
|
||||||
|
delete node_map[child.id];
|
||||||
|
}
|
||||||
|
folded = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return { nodes: Object.values(node_map), leaves: extracted_nodes };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Distribute PGs mapped to "folded" nodes to individual OSDs according to their weights
|
||||||
|
// folded_pgs = optimize_result.int_pgs before folding
|
||||||
|
// prev_pgs = optional previous PGs from optimize_change() input
|
||||||
|
// extracted_nodes = output from fold_failure_domains
|
||||||
|
function unfold_failure_domains(folded_pgs, prev_pgs, extracted_nodes)
|
||||||
|
{
|
||||||
|
const maps = {};
|
||||||
|
let found = false;
|
||||||
|
for (const new_id in extracted_nodes)
|
||||||
|
{
|
||||||
|
const weights = {};
|
||||||
|
for (const sub_node of extracted_nodes[new_id])
|
||||||
|
{
|
||||||
|
weights[sub_node.id] = sub_node.size;
|
||||||
|
}
|
||||||
|
maps[new_id] = { weights, prev: [], next: [], pos: 0 };
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
return folded_pgs;
|
||||||
|
}
|
||||||
|
for (let i = 0; i < folded_pgs.length; i++)
|
||||||
|
{
|
||||||
|
for (let j = 0; j < folded_pgs[i].length; j++)
|
||||||
|
{
|
||||||
|
if (maps[folded_pgs[i][j]])
|
||||||
|
{
|
||||||
|
maps[folded_pgs[i][j]].prev.push(prev_pgs && prev_pgs[i] && prev_pgs[i][j] || 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const new_id in maps)
|
||||||
|
{
|
||||||
|
maps[new_id].next = adjust_distribution(maps[new_id].weights, maps[new_id].prev);
|
||||||
|
}
|
||||||
|
const mapped_pgs = [];
|
||||||
|
for (let i = 0; i < folded_pgs.length; i++)
|
||||||
|
{
|
||||||
|
mapped_pgs.push(folded_pgs[i].map(osd => (maps[osd] ? maps[osd].next[maps[osd].pos++] : osd)));
|
||||||
|
}
|
||||||
|
return mapped_pgs;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the new array of items re-distributed as close as possible to weights in wanted_weights
|
||||||
|
// wanted_weights = { [key]: weight }
|
||||||
|
// cur_items = key[]
|
||||||
|
function adjust_distribution(wanted_weights, cur_items)
|
||||||
|
{
|
||||||
|
const item_map = {};
|
||||||
|
for (let i = 0; i < cur_items.length; i++)
|
||||||
|
{
|
||||||
|
const item = cur_items[i];
|
||||||
|
item_map[item] = (item_map[item] || { target: 0, cur: [] });
|
||||||
|
item_map[item].cur.push(i);
|
||||||
|
}
|
||||||
|
let total_weight = 0;
|
||||||
|
for (const item in wanted_weights)
|
||||||
|
{
|
||||||
|
total_weight += Number(wanted_weights[item]) || 0;
|
||||||
|
}
|
||||||
|
for (const item in wanted_weights)
|
||||||
|
{
|
||||||
|
const weight = wanted_weights[item] / total_weight * cur_items.length;
|
||||||
|
if (weight > 0)
|
||||||
|
{
|
||||||
|
item_map[item] = (item_map[item] || { target: 0, cur: [] });
|
||||||
|
item_map[item].target = weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const diff = (item) => (item_map[item].cur.length - item_map[item].target);
|
||||||
|
const most_underweighted = Object.keys(item_map)
|
||||||
|
.filter(item => item_map[item].target > 0)
|
||||||
|
.sort((a, b) => diff(a) - diff(b));
|
||||||
|
// Items with zero target weight MUST never be selected - remove them
|
||||||
|
// and remap each of them to a most underweighted item
|
||||||
|
for (const item in item_map)
|
||||||
|
{
|
||||||
|
if (!item_map[item].target)
|
||||||
|
{
|
||||||
|
const prev = item_map[item];
|
||||||
|
delete item_map[item];
|
||||||
|
for (const idx of prev.cur)
|
||||||
|
{
|
||||||
|
const move_to = most_underweighted[0];
|
||||||
|
item_map[move_to].cur.push(idx);
|
||||||
|
move_leftmost(most_underweighted, diff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Other over-weighted items are only moved if it improves the distribution
|
||||||
|
while (most_underweighted.length > 1)
|
||||||
|
{
|
||||||
|
const first = most_underweighted[0];
|
||||||
|
const last = most_underweighted[most_underweighted.length-1];
|
||||||
|
const first_diff = diff(first);
|
||||||
|
const last_diff = diff(last);
|
||||||
|
if (Math.abs(first_diff+1)+Math.abs(last_diff-1) < Math.abs(first_diff)+Math.abs(last_diff))
|
||||||
|
{
|
||||||
|
item_map[first].cur.push(item_map[last].cur.pop());
|
||||||
|
move_leftmost(most_underweighted, diff);
|
||||||
|
move_rightmost(most_underweighted, diff);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const new_items = new Array(cur_items.length);
|
||||||
|
for (const item in item_map)
|
||||||
|
{
|
||||||
|
for (const idx of item_map[item].cur)
|
||||||
|
{
|
||||||
|
new_items[idx] = item;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new_items;
|
||||||
|
}
|
||||||
|
|
||||||
|
function move_leftmost(sorted_array, diff)
|
||||||
|
{
|
||||||
|
// Re-sort by moving the leftmost item to the right if it changes position
|
||||||
|
const first = sorted_array[0];
|
||||||
|
const new_diff = diff(first);
|
||||||
|
let r = 0;
|
||||||
|
while (r < sorted_array.length-1 && diff(sorted_array[r+1]) <= new_diff)
|
||||||
|
r++;
|
||||||
|
if (r > 0)
|
||||||
|
{
|
||||||
|
for (let i = 0; i < r; i++)
|
||||||
|
sorted_array[i] = sorted_array[i+1];
|
||||||
|
sorted_array[r] = first;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function move_rightmost(sorted_array, diff)
|
||||||
|
{
|
||||||
|
// Re-sort by moving the rightmost item to the left if it changes position
|
||||||
|
const last = sorted_array[sorted_array.length-1];
|
||||||
|
const new_diff = diff(last);
|
||||||
|
let r = sorted_array.length-1;
|
||||||
|
while (r > 0 && diff(sorted_array[r-1]) > new_diff)
|
||||||
|
r--;
|
||||||
|
if (r < sorted_array.length-1)
|
||||||
|
{
|
||||||
|
for (let i = sorted_array.length-1; i > r; i--)
|
||||||
|
sorted_array[i] = sorted_array[i-1];
|
||||||
|
sorted_array[r] = last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// map previous PGs to folded nodes
|
||||||
|
function fold_prev_pgs(pgs, extracted_nodes)
|
||||||
|
{
|
||||||
|
const unmap = {};
|
||||||
|
for (const new_id in extracted_nodes)
|
||||||
|
{
|
||||||
|
for (const sub_node of extracted_nodes[new_id])
|
||||||
|
{
|
||||||
|
unmap[sub_node.id] = new_id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const mapped_pgs = [];
|
||||||
|
for (let i = 0; i < pgs.length; i++)
|
||||||
|
{
|
||||||
|
mapped_pgs.push(pgs[i].map(osd => (unmap[osd] || osd)));
|
||||||
|
}
|
||||||
|
return mapped_pgs;
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
fold_failure_domains,
|
||||||
|
unfold_failure_domains,
|
||||||
|
adjust_distribution,
|
||||||
|
fold_prev_pgs,
|
||||||
|
};
|
|
@ -98,6 +98,7 @@ async function optimize_initial({ osd_weights, combinator, pg_count, pg_size = 3
|
||||||
score: lp_result.score,
|
score: lp_result.score,
|
||||||
weights: lp_result.vars,
|
weights: lp_result.vars,
|
||||||
int_pgs,
|
int_pgs,
|
||||||
|
pg_effsize,
|
||||||
space: eff * pg_effsize,
|
space: eff * pg_effsize,
|
||||||
total_space: total_weight,
|
total_space: total_weight,
|
||||||
};
|
};
|
||||||
|
@ -409,6 +410,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_weights, combinator
|
||||||
int_pgs: new_pgs,
|
int_pgs: new_pgs,
|
||||||
differs,
|
differs,
|
||||||
osd_differs,
|
osd_differs,
|
||||||
|
pg_effsize,
|
||||||
space: pg_effsize * pg_list_space_efficiency(new_pgs, osd_weights, pg_minsize, parity_space),
|
space: pg_effsize * pg_list_space_efficiency(new_pgs, osd_weights, pg_minsize, parity_space),
|
||||||
total_space: total_weight,
|
total_space: total_weight,
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
|
const assert = require('assert');
|
||||||
|
const { fold_failure_domains, unfold_failure_domains, adjust_distribution } = require('./fold.js');
|
||||||
|
const DSL = require('./dsl_pgs.js');
|
||||||
|
const LPOptimizer = require('./lp_optimizer.js');
|
||||||
|
const stableStringify = require('../stable-stringify.js');
|
||||||
|
|
||||||
|
async function run()
|
||||||
|
{
|
||||||
|
// Test run adjust_distribution
|
||||||
|
console.log('adjust_distribution');
|
||||||
|
const rand = [];
|
||||||
|
for (let i = 0; i < 100; i++)
|
||||||
|
{
|
||||||
|
rand.push(1 + Math.floor(10*Math.random()));
|
||||||
|
// or rand.push(0);
|
||||||
|
}
|
||||||
|
const adj = adjust_distribution({ 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1 }, rand);
|
||||||
|
//console.log(rand.join(' '));
|
||||||
|
console.log(rand.reduce((a, c) => { a[c] = (a[c]||0)+1; return a; }, {}));
|
||||||
|
//console.log(adj.join(' '));
|
||||||
|
console.log(adj.reduce((a, c) => { a[c] = (a[c]||0)+1; return a; }, {}));
|
||||||
|
console.log('Movement: '+rand.reduce((a, c, i) => a+(rand[i] != adj[i] ? 1 : 0), 0)+'/'+rand.length);
|
||||||
|
|
||||||
|
console.log('\nfold_failure_domains');
|
||||||
|
console.log(JSON.stringify(fold_failure_domains(
|
||||||
|
[
|
||||||
|
{ id: 1, level: 'osd', size: 1, parent: 'disk1' },
|
||||||
|
{ id: 2, level: 'osd', size: 2, parent: 'disk1' },
|
||||||
|
{ id: 'disk1', level: 'disk', parent: 'host1' },
|
||||||
|
{ id: 'host1', level: 'host', parent: 'dc1' },
|
||||||
|
{ id: 'dc1', level: 'dc' },
|
||||||
|
],
|
||||||
|
[ [ [ 'dc' ], [ 'host' ] ] ]
|
||||||
|
), 0, 2));
|
||||||
|
|
||||||
|
console.log('\nfold_failure_domains empty rules');
|
||||||
|
console.log(JSON.stringify(fold_failure_domains(
|
||||||
|
[
|
||||||
|
{ id: 1, level: 'osd', size: 1, parent: 'disk1' },
|
||||||
|
{ id: 2, level: 'osd', size: 2, parent: 'disk1' },
|
||||||
|
{ id: 'disk1', level: 'disk', parent: 'host1' },
|
||||||
|
{ id: 'host1', level: 'host', parent: 'dc1' },
|
||||||
|
{ id: 'dc1', level: 'dc' },
|
||||||
|
],
|
||||||
|
[]
|
||||||
|
), 0, 2));
|
||||||
|
|
||||||
|
console.log('\noptimize_folded');
|
||||||
|
// 5 DCs, 2 hosts per DC, 10 OSD per host
|
||||||
|
const nodes = [];
|
||||||
|
for (let i = 1; i <= 100; i++)
|
||||||
|
{
|
||||||
|
nodes.push({ id: i, level: 'osd', size: 1, parent: 'host'+(1+(0|((i-1)/10))) });
|
||||||
|
}
|
||||||
|
for (let i = 1; i <= 10; i++)
|
||||||
|
{
|
||||||
|
nodes.push({ id: 'host'+i, level: 'host', parent: 'dc'+(1+(0|((i-1)/2))) });
|
||||||
|
}
|
||||||
|
for (let i = 1; i <= 5; i++)
|
||||||
|
{
|
||||||
|
nodes.push({ id: 'dc'+i, level: 'dc' });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check rules
|
||||||
|
const rules = DSL.parse_level_indexes({ dc: '112233', host: '123456' }, [ 'dc', 'host', 'osd' ]);
|
||||||
|
assert.deepEqual(rules, [[],[["dc","=",1],["host","!=",[1]]],[["dc","!=",[1]]],[["dc","=",3],["host","!=",[3]]],[["dc","!=",[1,3]]],[["dc","=",5],["host","!=",[5]]]]);
|
||||||
|
|
||||||
|
// Check tree folding
|
||||||
|
const { nodes: folded_nodes, leaves: folded_leaves } = fold_failure_domains(nodes, rules);
|
||||||
|
const expected_folded = [];
|
||||||
|
const expected_leaves = {};
|
||||||
|
for (let i = 1; i <= 10; i++)
|
||||||
|
{
|
||||||
|
expected_folded.push({ id: 100+i, name: 'host'+i, level: 'host', size: 10, parent: 'dc'+(1+(0|((i-1)/2))) });
|
||||||
|
expected_leaves[100+i] = [ ...new Array(10).keys() ].map(k => ({ id: 10*(i-1)+k+1, level: 'osd', size: 1, parent: 'host'+i }));
|
||||||
|
}
|
||||||
|
for (let i = 1; i <= 5; i++)
|
||||||
|
{
|
||||||
|
expected_folded.push({ id: 'dc'+i, level: 'dc' });
|
||||||
|
}
|
||||||
|
assert.equal(stableStringify(folded_nodes), stableStringify(expected_folded));
|
||||||
|
assert.equal(stableStringify(folded_leaves), stableStringify(expected_leaves));
|
||||||
|
|
||||||
|
// Now optimise it
|
||||||
|
console.log('1000 PGs, EC 112233');
|
||||||
|
const leaf_weights = folded_nodes.reduce((a, c) => { if (Number(c.id)) { a[c.id] = c.size; } return a; }, {});
|
||||||
|
let res = await LPOptimizer.optimize_initial({
|
||||||
|
osd_weights: leaf_weights,
|
||||||
|
combinator: new DSL.RuleCombinator(folded_nodes, rules, 10000, false),
|
||||||
|
pg_size: 6,
|
||||||
|
pg_count: 1000,
|
||||||
|
ordered: false,
|
||||||
|
});
|
||||||
|
LPOptimizer.print_change_stats(res, false);
|
||||||
|
assert.equal(res.space, 100, 'Initial distribution');
|
||||||
|
|
||||||
|
const unfolded_res = { ...res };
|
||||||
|
unfolded_res.int_pgs = unfold_failure_domains(res.int_pgs, null, folded_leaves);
|
||||||
|
const osd_weights = nodes.reduce((a, c) => { if (Number(c.id)) { a[c.id] = c.size; } return a; }, {});
|
||||||
|
unfolded_res.space = unfolded_res.pg_effsize * LPOptimizer.pg_list_space_efficiency(unfolded_res.int_pgs, osd_weights, 0, 1);
|
||||||
|
LPOptimizer.print_change_stats(unfolded_res, false);
|
||||||
|
assert.equal(res.space, 100, 'Initial distribution');
|
||||||
|
}
|
||||||
|
|
||||||
|
run().catch(console.error);
|
|
@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
|
||||||
const stat = state.osd.stats[osd_num];
|
const stat = state.osd.stats[osd_num];
|
||||||
const osd_cfg = state.config.osd[osd_num];
|
const osd_cfg = state.config.osd[osd_num];
|
||||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||||
if (reweight < 0 || isNaN(reweight))
|
if (isNaN(reweight) || reweight < 0 || reweight > 0)
|
||||||
reweight = 1;
|
reweight = 1;
|
||||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||||
osd_cfg && osd_cfg.noout))
|
osd_cfg && osd_cfg.noout))
|
||||||
|
@ -87,7 +87,7 @@ function make_hier_tree(global_config, tree)
|
||||||
tree[''] = { children: [] };
|
tree[''] = { children: [] };
|
||||||
for (const node_id in tree)
|
for (const node_id in tree)
|
||||||
{
|
{
|
||||||
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
|
if (node_id === '' || !(tree[node_id].children||[]).length && (tree[node_id].size||0) <= 0)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -107,10 +107,10 @@ function make_hier_tree(global_config, tree)
|
||||||
deleted = 0;
|
deleted = 0;
|
||||||
for (const node_id in tree)
|
for (const node_id in tree)
|
||||||
{
|
{
|
||||||
if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
|
if (!(tree[node_id].children||[]).length && (tree[node_id].size||0) <= 0)
|
||||||
{
|
{
|
||||||
const parent = tree[node_id].parent;
|
const parent = tree[node_id].parent;
|
||||||
if (parent)
|
if (parent && tree[parent])
|
||||||
{
|
{
|
||||||
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
|
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "vitastor-mon",
|
"name": "vitastor-mon",
|
||||||
"version": "2.1.0",
|
"version": "2.2.0",
|
||||||
"description": "Vitastor SDS monitor service",
|
"description": "Vitastor SDS monitor service",
|
||||||
"main": "mon-main.js",
|
"main": "mon-main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
@ -19,6 +19,6 @@
|
||||||
"eslint-plugin-node": "^11.1.0"
|
"eslint-plugin-node": "^11.1.0"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=12.1.0"
|
"node": ">=12.0.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
|
const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
|
||||||
const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
|
const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
|
||||||
|
const { fold_failure_domains, unfold_failure_domains, fold_prev_pgs } = require('./lp_optimizer/fold.js');
|
||||||
const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
|
const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
|
||||||
const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
|
const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
|
||||||
const { scale_pg_count } = require('./pg_utils.js');
|
const { scale_pg_count } = require('./pg_utils.js');
|
||||||
|
@ -160,7 +161,6 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||||
pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
|
pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
|
||||||
pool_cfg.immediate_commit || global_config.immediate_commit || 'all'
|
pool_cfg.immediate_commit || global_config.immediate_commit || 'all'
|
||||||
);
|
);
|
||||||
pool_tree = make_hier_tree(global_config, pool_tree);
|
|
||||||
// First try last_clean_pgs to minimize data movement
|
// First try last_clean_pgs to minimize data movement
|
||||||
let prev_pgs = [];
|
let prev_pgs = [];
|
||||||
for (const pg in ((state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
for (const pg in ((state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
||||||
|
@ -175,14 +175,19 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||||
prev_pgs[pg-1] = [ ...state.pg.config.items[pool_id][pg].osd_set ];
|
prev_pgs[pg-1] = [ ...state.pg.config.items[pool_id][pg].osd_set ];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
const use_rules = !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement;
|
||||||
|
const rules = use_rules ? get_pg_rules(pool_id, pool_cfg, global_config.placement_levels) : null;
|
||||||
|
const folded = fold_failure_domains(Object.values(pool_tree), use_rules ? rules : [ [ [ pool_cfg.failure_domain ] ] ]);
|
||||||
|
// FIXME: Remove/merge make_hier_tree() step somewhere, however it's needed to remove empty nodes
|
||||||
|
const folded_tree = make_hier_tree(global_config, folded.nodes);
|
||||||
const old_pg_count = prev_pgs.length;
|
const old_pg_count = prev_pgs.length;
|
||||||
const optimize_cfg = {
|
const optimize_cfg = {
|
||||||
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
|
osd_weights: folded.nodes.reduce((a, c) => { if (Number(c.id)) { a[c.id] = c.size; } return a; }, {}),
|
||||||
combinator: !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
|
combinator: use_rules
|
||||||
// new algorithm:
|
// new algorithm:
|
||||||
? new RuleCombinator(pool_tree, get_pg_rules(pool_id, pool_cfg, global_config.placement_levels), pool_cfg.max_osd_combinations)
|
? new RuleCombinator(folded_tree, rules, pool_cfg.max_osd_combinations)
|
||||||
// old algorithm:
|
// old algorithm:
|
||||||
: new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
|
: new SimpleCombinator(flatten_tree(folded_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
|
||||||
pg_count: pool_cfg.pg_count,
|
pg_count: pool_cfg.pg_count,
|
||||||
pg_size: pool_cfg.pg_size,
|
pg_size: pool_cfg.pg_size,
|
||||||
pg_minsize: pool_cfg.pg_minsize,
|
pg_minsize: pool_cfg.pg_minsize,
|
||||||
|
@ -202,12 +207,11 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||||
for (const pg of prev_pgs)
|
for (const pg of prev_pgs)
|
||||||
{
|
{
|
||||||
while (pg.length < pool_cfg.pg_size)
|
while (pg.length < pool_cfg.pg_size)
|
||||||
{
|
|
||||||
pg.push(0);
|
pg.push(0);
|
||||||
}
|
}
|
||||||
}
|
const folded_prev_pgs = fold_prev_pgs(prev_pgs, folded.leaves);
|
||||||
optimize_result = await LPOptimizer.optimize_change({
|
optimize_result = await LPOptimizer.optimize_change({
|
||||||
prev_pgs,
|
prev_pgs: folded_prev_pgs,
|
||||||
...optimize_cfg,
|
...optimize_cfg,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -215,6 +219,10 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
|
||||||
{
|
{
|
||||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||||
}
|
}
|
||||||
|
optimize_result.int_pgs = unfold_failure_domains(optimize_result.int_pgs, prev_pgs, folded.leaves);
|
||||||
|
const osd_weights = Object.values(pool_tree).reduce((a, c) => { if (c.level === 'osd') { a[c.id] = c.size; } return a; }, {});
|
||||||
|
optimize_result.space = optimize_result.pg_effsize * LPOptimizer.pg_list_space_efficiency(optimize_result.int_pgs,
|
||||||
|
osd_weights, optimize_cfg.pg_minsize, 1);
|
||||||
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
|
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
|
||||||
LPOptimizer.print_change_stats(optimize_result);
|
LPOptimizer.print_change_stats(optimize_result);
|
||||||
let pg_effsize = pool_cfg.pg_size;
|
let pg_effsize = pool_cfg.pg_size;
|
||||||
|
|
|
@ -40,6 +40,11 @@ async function run()
|
||||||
console.log("/etc/systemd/system/vitastor-etcd.service already exists");
|
console.log("/etc/systemd/system/vitastor-etcd.service already exists");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
if (!in_docker && fs.existsSync("/etc/systemd/system/etcd.service"))
|
||||||
|
{
|
||||||
|
console.log("/etc/systemd/system/etcd.service already exists");
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
|
const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
|
||||||
if (!config.etcd_address)
|
if (!config.etcd_address)
|
||||||
{
|
{
|
||||||
|
@ -66,7 +71,7 @@ async function run()
|
||||||
console.log('etcd for Vitastor configured. Run `systemctl enable --now vitastor-etcd` to start etcd');
|
console.log('etcd for Vitastor configured. Run `systemctl enable --now vitastor-etcd` to start etcd');
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
await system(`mkdir -p /var/lib/etcd`);
|
await system(`mkdir -p /var/lib/etcd/vitastor`);
|
||||||
fs.writeFileSync(
|
fs.writeFileSync(
|
||||||
"/etc/systemd/system/vitastor-etcd.service",
|
"/etc/systemd/system/vitastor-etcd.service",
|
||||||
`[Unit]
|
`[Unit]
|
||||||
|
@ -77,14 +82,14 @@ Wants=network-online.target local-fs.target time-sync.target
|
||||||
[Service]
|
[Service]
|
||||||
Restart=always
|
Restart=always
|
||||||
Environment=GOGC=50
|
Environment=GOGC=50
|
||||||
ExecStart=etcd --name ${etcd_name} --data-dir /var/lib/etcd \\
|
ExecStart=etcd --name ${etcd_name} --data-dir /var/lib/etcd/vitastor \\
|
||||||
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
||||||
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
|
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
|
||||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
|
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
|
||||||
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
||||||
--auto-compaction-retention=10 --auto-compaction-mode=revision
|
--auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||||
WorkingDirectory=/var/lib/etcd
|
WorkingDirectory=/var/lib/etcd/vitastor
|
||||||
ExecStartPre=+chown -R etcd /var/lib/etcd
|
ExecStartPre=+chown -R etcd /var/lib/etcd/vitastor
|
||||||
User=etcd
|
User=etcd
|
||||||
PrivateTmp=false
|
PrivateTmp=false
|
||||||
TasksMax=infinity
|
TasksMax=infinity
|
||||||
|
@ -97,8 +102,9 @@ WantedBy=multi-user.target
|
||||||
`);
|
`);
|
||||||
await system(`useradd etcd`);
|
await system(`useradd etcd`);
|
||||||
await system(`systemctl daemon-reload`);
|
await system(`systemctl daemon-reload`);
|
||||||
await system(`systemctl enable etcd`);
|
// Disable distribution etcd unit and enable our one
|
||||||
await system(`systemctl start etcd`);
|
await system(`systemctl disable --now etcd`);
|
||||||
|
await system(`systemctl enable --now vitastor-etcd`);
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
24
mon/stats.js
24
mon/stats.js
|
@ -87,11 +87,25 @@ function sum_op_stats(all_osd, prev_stats)
|
||||||
for (const k in derived[type][op])
|
for (const k in derived[type][op])
|
||||||
{
|
{
|
||||||
sum_diff[type][op] = sum_diff[type][op] || {};
|
sum_diff[type][op] = sum_diff[type][op] || {};
|
||||||
|
if (k == 'lat')
|
||||||
|
sum_diff[type][op].lat = (sum_diff[type][op].lat || 0n) + derived[type][op].lat*derived[type][op].iops;
|
||||||
|
else
|
||||||
sum_diff[type][op][k] = (sum_diff[type][op][k] || 0n) + derived[type][op][k];
|
sum_diff[type][op][k] = (sum_diff[type][op][k] || 0n) + derived[type][op][k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Calculate average (weighted by iops) op latency across all OSDs
|
||||||
|
for (const type in sum_diff)
|
||||||
|
{
|
||||||
|
for (const op in sum_diff[type])
|
||||||
|
{
|
||||||
|
if (sum_diff[type][op].lat)
|
||||||
|
{
|
||||||
|
sum_diff[type][op].lat /= sum_diff[type][op].iops;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return sum_diff;
|
return sum_diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -271,8 +285,7 @@ function sum_inode_stats(state, prev_stats)
|
||||||
const op_st = inode_stats[pool_id][inode_num][op];
|
const op_st = inode_stats[pool_id][inode_num][op];
|
||||||
op_st.bps += op_diff.bps;
|
op_st.bps += op_diff.bps;
|
||||||
op_st.iops += op_diff.iops;
|
op_st.iops += op_diff.iops;
|
||||||
op_st.lat += op_diff.lat;
|
op_st.lat += op_diff.lat*op_diff.iops;
|
||||||
op_st.n_osd = (op_st.n_osd || 0) + 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -285,11 +298,8 @@ function sum_inode_stats(state, prev_stats)
|
||||||
for (const op of [ 'read', 'write', 'delete' ])
|
for (const op of [ 'read', 'write', 'delete' ])
|
||||||
{
|
{
|
||||||
const op_st = inode_stats[pool_id][inode_num][op];
|
const op_st = inode_stats[pool_id][inode_num][op];
|
||||||
if (op_st.n_osd)
|
if (op_st.lat)
|
||||||
{
|
op_st.lat /= op_st.iops;
|
||||||
op_st.lat /= BigInt(op_st.n_osd);
|
|
||||||
delete op_st.n_osd;
|
|
||||||
}
|
|
||||||
if (op_st.bps > 0 || op_st.iops > 0)
|
if (op_st.bps > 0 || op_st.iops > 0)
|
||||||
nonzero = true;
|
nonzero = true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "vitastor",
|
"name": "vitastor",
|
||||||
"version": "2.1.0",
|
"version": "2.2.0",
|
||||||
"description": "Low-level native bindings to Vitastor client library",
|
"description": "Low-level native bindings to Vitastor client library",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
|
|
|
@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VITASTOR_VERSION = '2.1.0'
|
VITASTOR_VERSION = '2.2.0'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 2.1.0
|
Version: 2.2.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-2.1.0.el7.tar.gz
|
Source0: vitastor-2.2.0.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 2.1.0
|
Version: 2.2.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-2.1.0.el8.tar.gz
|
Source0: vitastor-2.2.0.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 2.1.0
|
Version: 2.2.0
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-2.1.0.el9.tar.gz
|
Source0: vitastor-2.2.0.el9.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVITASTOR_VERSION="2.1.0")
|
add_definitions(-DVITASTOR_VERSION="2.2.0")
|
||||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
add_link_options(-fno-omit-frame-pointer)
|
add_link_options(-fno-omit-frame-pointer)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
|
|
|
@ -266,6 +266,8 @@ class blockstore_impl_t
|
||||||
int throttle_threshold_us = 50;
|
int throttle_threshold_us = 50;
|
||||||
// Maximum writes between automatically added fsync operations
|
// Maximum writes between automatically added fsync operations
|
||||||
uint64_t autosync_writes = 128;
|
uint64_t autosync_writes = 128;
|
||||||
|
// Log level (0-10)
|
||||||
|
int log_level = 0;
|
||||||
/******* END OF OPTIONS *******/
|
/******* END OF OPTIONS *******/
|
||||||
|
|
||||||
struct ring_consumer_t ring_consumer;
|
struct ring_consumer_t ring_consumer;
|
||||||
|
|
|
@ -113,10 +113,13 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||||
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
|
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
|
||||||
{
|
{
|
||||||
// No space in the journal. Wait until used_start changes.
|
// No space in the journal. Wait until used_start changes.
|
||||||
|
if (bs->log_level > 5)
|
||||||
|
{
|
||||||
printf(
|
printf(
|
||||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||||
);
|
);
|
||||||
|
}
|
||||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||||
bs->flusher->request_trim();
|
bs->flusher->request_trim();
|
||||||
PRIV(op)->wait_detail = bs->journal.used_start;
|
PRIV(op)->wait_detail = bs->journal.used_start;
|
||||||
|
|
|
@ -101,6 +101,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||||
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||||
config["inmemory_journal"] != "no";
|
config["inmemory_journal"] != "no";
|
||||||
|
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||||
// Validate
|
// Validate
|
||||||
if (journal.sector_count < 2)
|
if (journal.sector_count < 2)
|
||||||
{
|
{
|
||||||
|
|
|
@ -93,7 +93,7 @@ add_executable(test_cluster_client
|
||||||
EXCLUDE_FROM_ALL
|
EXCLUDE_FROM_ALL
|
||||||
../test/test_cluster_client.cpp
|
../test/test_cluster_client.cpp
|
||||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp ../test/mock/messenger.cpp msgr_stop.cpp
|
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp ../test/mock/messenger.cpp msgr_stop.cpp
|
||||||
etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp
|
etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/addr_util.cpp ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp
|
||||||
)
|
)
|
||||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||||
target_include_directories(test_cluster_client BEFORE PUBLIC ${CMAKE_SOURCE_DIR}/src/test/mock)
|
target_include_directories(test_cluster_client BEFORE PUBLIC ${CMAKE_SOURCE_DIR}/src/test/mock)
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include "pg_states.h"
|
||||||
#include "cluster_client_impl.h"
|
#include "cluster_client_impl.h"
|
||||||
#include "json_util.h"
|
#include "json_util.h"
|
||||||
|
|
||||||
|
@ -57,6 +58,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||||
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
||||||
st_cli.on_change_pool_config_hook = [this]() { on_change_pool_config_hook(); };
|
st_cli.on_change_pool_config_hook = [this]() { on_change_pool_config_hook(); };
|
||||||
st_cli.on_change_pg_state_hook = [this](pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary) { on_change_pg_state_hook(pool_id, pg_num, prev_primary); };
|
st_cli.on_change_pg_state_hook = [this](pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary) { on_change_pg_state_hook(pool_id, pg_num, prev_primary); };
|
||||||
|
st_cli.on_change_node_placement_hook = [this]() { on_change_node_placement_hook(); };
|
||||||
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
||||||
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
||||||
|
|
||||||
|
@ -470,11 +472,95 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
||||||
}
|
}
|
||||||
// log_level
|
// log_level
|
||||||
log_level = config["log_level"].uint64_value();
|
log_level = config["log_level"].uint64_value();
|
||||||
|
// hostname
|
||||||
|
conf_hostname = config["hostname"].string_value();
|
||||||
|
auto new_hostname = conf_hostname != "" ? conf_hostname : gethostname_str();
|
||||||
|
if (new_hostname != client_hostname)
|
||||||
|
{
|
||||||
|
self_tree_metrics.clear();
|
||||||
|
client_hostname = new_hostname;
|
||||||
|
}
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(config);
|
||||||
st_cli.parse_config(config);
|
st_cli.parse_config(config);
|
||||||
st_cli.load_pgs();
|
st_cli.load_pgs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
osd_num_t cluster_client_t::select_random_osd(const std::vector<osd_num_t> & osds)
|
||||||
|
{
|
||||||
|
osd_num_t alive_set[osds.size()];
|
||||||
|
int alive_count = 0;
|
||||||
|
for (auto & osd_num: osds)
|
||||||
|
{
|
||||||
|
if (!st_cli.peer_states[osd_num].is_null())
|
||||||
|
alive_set[alive_count++] = osd_num;
|
||||||
|
}
|
||||||
|
if (!alive_count)
|
||||||
|
return 0;
|
||||||
|
return alive_set[lrand48() % alive_count];
|
||||||
|
}
|
||||||
|
|
||||||
|
osd_num_t cluster_client_t::select_nearest_osd(const std::vector<osd_num_t> & osds)
|
||||||
|
{
|
||||||
|
if (!self_tree_metrics.size())
|
||||||
|
{
|
||||||
|
std::string cur_id = client_hostname;
|
||||||
|
int metric = 0;
|
||||||
|
while (self_tree_metrics.find(cur_id) == self_tree_metrics.end())
|
||||||
|
{
|
||||||
|
self_tree_metrics[cur_id] = metric++;
|
||||||
|
json11::Json cur_placement = st_cli.node_placement[cur_id];
|
||||||
|
cur_id = cur_placement["parent"].string_value();
|
||||||
|
}
|
||||||
|
if (cur_id != "")
|
||||||
|
{
|
||||||
|
self_tree_metrics[""] = metric++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
osd_num_t best_osd = 0;
|
||||||
|
int best_metric = -1;
|
||||||
|
for (auto & osd_num: osds)
|
||||||
|
{
|
||||||
|
int metric = -1;
|
||||||
|
auto met_it = osd_tree_metrics.find(osd_num);
|
||||||
|
if (met_it != osd_tree_metrics.end())
|
||||||
|
{
|
||||||
|
metric = met_it->second;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
auto & peer_state = st_cli.peer_states[osd_num];
|
||||||
|
if (!peer_state.is_null())
|
||||||
|
{
|
||||||
|
metric = self_tree_metrics[""];
|
||||||
|
bool first = true;
|
||||||
|
std::string cur_id = std::to_string(osd_num);
|
||||||
|
std::set<std::string> seen;
|
||||||
|
while (seen.find(cur_id) == seen.end())
|
||||||
|
{
|
||||||
|
seen.insert(cur_id);
|
||||||
|
json11::Json cur_placement = st_cli.node_placement[cur_id];
|
||||||
|
std::string cur_parent = cur_placement["parent"].string_value();
|
||||||
|
cur_id = (!first || cur_parent != "" ? cur_parent : peer_state["host"].string_value());
|
||||||
|
first = false;
|
||||||
|
auto self_it = self_tree_metrics.find(cur_id);
|
||||||
|
if (self_it != self_tree_metrics.end())
|
||||||
|
{
|
||||||
|
metric = self_it->second;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
osd_tree_metrics[osd_num] = metric;
|
||||||
|
}
|
||||||
|
if (metric >= 0 && (best_metric < 0 || metric < best_metric))
|
||||||
|
{
|
||||||
|
best_metric = metric;
|
||||||
|
best_osd = osd_num;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return best_osd;
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::on_load_pgs_hook(bool success)
|
void cluster_client_t::on_load_pgs_hook(bool success)
|
||||||
{
|
{
|
||||||
for (auto pool_item: st_cli.pool_config)
|
for (auto pool_item: st_cli.pool_config)
|
||||||
|
@ -546,6 +632,7 @@ bool cluster_client_t::get_immediate_commit(uint64_t inode)
|
||||||
|
|
||||||
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
|
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
|
||||||
{
|
{
|
||||||
|
osd_tree_metrics.erase(peer_osd);
|
||||||
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
|
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
|
||||||
{
|
{
|
||||||
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||||
|
@ -553,6 +640,12 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::on_change_node_placement_hook()
|
||||||
|
{
|
||||||
|
osd_tree_metrics.clear();
|
||||||
|
self_tree_metrics.clear();
|
||||||
|
}
|
||||||
|
|
||||||
bool cluster_client_t::is_ready()
|
bool cluster_client_t::is_ready()
|
||||||
{
|
{
|
||||||
return pgs_loaded;
|
return pgs_loaded;
|
||||||
|
@ -1221,6 +1314,17 @@ int cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
!pg_it->second.pause && pg_it->second.cur_primary)
|
!pg_it->second.pause && pg_it->second.cur_primary)
|
||||||
{
|
{
|
||||||
osd_num_t primary_osd = pg_it->second.cur_primary;
|
osd_num_t primary_osd = pg_it->second.cur_primary;
|
||||||
|
if (pool_cfg.local_reads != POOL_LOCAL_READ_PRIMARY &&
|
||||||
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
|
||||||
|
(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
||||||
|
(pg_it->second.cur_state == PG_ACTIVE || pg_it->second.cur_state == (PG_ACTIVE|PG_LEFT_ON_DEAD)))
|
||||||
|
{
|
||||||
|
osd_num_t nearest_osd = pool_cfg.local_reads == POOL_LOCAL_READ_NEAREST
|
||||||
|
? select_nearest_osd(pg_it->second.target_set)
|
||||||
|
: select_random_osd(pg_it->second.target_set);
|
||||||
|
if (nearest_osd)
|
||||||
|
primary_osd = nearest_osd;
|
||||||
|
}
|
||||||
part->osd_num = primary_osd;
|
part->osd_num = primary_osd;
|
||||||
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
|
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
|
||||||
if (peer_it != msgr.osd_peer_fds.end())
|
if (peer_it != msgr.osd_peer_fds.end())
|
||||||
|
@ -1244,7 +1348,6 @@ int cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
.req = { .rw = {
|
.req = { .rw = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = next_op_id(),
|
|
||||||
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
||||||
},
|
},
|
||||||
.inode = op->cur_inode,
|
.inode = op->cur_inode,
|
||||||
|
@ -1353,7 +1456,6 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||||
.req = {
|
.req = {
|
||||||
.hdr = {
|
.hdr = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = next_op_id(),
|
|
||||||
.opcode = OSD_OP_SYNC,
|
.opcode = OSD_OP_SYNC,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -1498,8 +1600,3 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||||
part_len--;
|
part_len--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t cluster_client_t::next_op_id()
|
|
||||||
{
|
|
||||||
return msgr.next_subop_id++;
|
|
||||||
}
|
|
||||||
|
|
|
@ -86,8 +86,8 @@ class cluster_client_t
|
||||||
#ifdef __MOCK__
|
#ifdef __MOCK__
|
||||||
public:
|
public:
|
||||||
#endif
|
#endif
|
||||||
timerfd_manager_t *tfd;
|
timerfd_manager_t *tfd = NULL;
|
||||||
ring_loop_t *ringloop;
|
ring_loop_t *ringloop = NULL;
|
||||||
|
|
||||||
std::map<pool_id_t, uint64_t> pg_counts;
|
std::map<pool_id_t, uint64_t> pg_counts;
|
||||||
std::map<pool_pg_num_t, osd_num_t> pg_primary;
|
std::map<pool_pg_num_t, osd_num_t> pg_primary;
|
||||||
|
@ -100,6 +100,7 @@ public:
|
||||||
uint64_t client_max_buffered_bytes = 0;
|
uint64_t client_max_buffered_bytes = 0;
|
||||||
uint64_t client_max_buffered_ops = 0;
|
uint64_t client_max_buffered_ops = 0;
|
||||||
uint64_t client_max_writeback_iodepth = 0;
|
uint64_t client_max_writeback_iodepth = 0;
|
||||||
|
std::string conf_hostname;
|
||||||
|
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
int client_retry_interval = 50; // ms
|
int client_retry_interval = 50; // ms
|
||||||
|
@ -107,6 +108,10 @@ public:
|
||||||
bool client_retry_enospc = true;
|
bool client_retry_enospc = true;
|
||||||
int client_wait_up_timeout = 16; // sec (for listings)
|
int client_wait_up_timeout = 16; // sec (for listings)
|
||||||
|
|
||||||
|
std::string client_hostname;
|
||||||
|
std::map<std::string, int> self_tree_metrics;
|
||||||
|
std::map<osd_num_t, int> osd_tree_metrics;
|
||||||
|
|
||||||
int retry_timeout_id = -1;
|
int retry_timeout_id = -1;
|
||||||
int retry_timeout_duration = 0;
|
int retry_timeout_duration = 0;
|
||||||
std::vector<cluster_op_t*> offline_ops;
|
std::vector<cluster_op_t*> offline_ops;
|
||||||
|
@ -152,7 +157,6 @@ public:
|
||||||
|
|
||||||
//inline uint32_t get_bs_bitmap_granularity() { return st_cli.global_bitmap_granularity; }
|
//inline uint32_t get_bs_bitmap_granularity() { return st_cli.global_bitmap_granularity; }
|
||||||
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
|
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
|
||||||
uint64_t next_op_id();
|
|
||||||
|
|
||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
protected:
|
protected:
|
||||||
|
@ -162,11 +166,14 @@ protected:
|
||||||
protected:
|
protected:
|
||||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||||
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
|
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
|
||||||
|
|
||||||
void on_load_config_hook(json11::Json::object & config);
|
void on_load_config_hook(json11::Json::object & config);
|
||||||
void on_load_pgs_hook(bool success);
|
void on_load_pgs_hook(bool success);
|
||||||
void on_change_pool_config_hook();
|
void on_change_pool_config_hook();
|
||||||
void on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary);
|
void on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary);
|
||||||
void on_change_osd_state_hook(uint64_t peer_osd);
|
void on_change_osd_state_hook(uint64_t peer_osd);
|
||||||
|
void on_change_node_placement_hook();
|
||||||
|
|
||||||
void execute_internal(cluster_op_t *op);
|
void execute_internal(cluster_op_t *op);
|
||||||
void unshift_op(cluster_op_t *op);
|
void unshift_op(cluster_op_t *op);
|
||||||
int continue_rw(cluster_op_t *op);
|
int continue_rw(cluster_op_t *op);
|
||||||
|
@ -192,5 +199,8 @@ protected:
|
||||||
bool check_finish_listing(inode_list_t *lst);
|
bool check_finish_listing(inode_list_t *lst);
|
||||||
void continue_raw_ops(osd_num_t peer_osd);
|
void continue_raw_ops(osd_num_t peer_osd);
|
||||||
|
|
||||||
|
osd_num_t select_random_osd(const std::vector<osd_num_t> & osds);
|
||||||
|
osd_num_t select_nearest_osd(const std::vector<osd_num_t> & osds);
|
||||||
|
|
||||||
friend class writeback_cache_t;
|
friend class writeback_cache_t;
|
||||||
};
|
};
|
||||||
|
|
|
@ -342,7 +342,6 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||||
.sec_list = {
|
.sec_list = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = next_op_id(),
|
|
||||||
.opcode = OSD_OP_SEC_LIST,
|
.opcode = OSD_OP_SEC_LIST,
|
||||||
},
|
},
|
||||||
.list_pg = cur_list->pg->pg_num,
|
.list_pg = cur_list->pg->pg_num,
|
||||||
|
|
|
@ -922,6 +922,19 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
pc.used_for_app = "fs:"+pc.used_for_app;
|
pc.used_for_app = "fs:"+pc.used_for_app;
|
||||||
else
|
else
|
||||||
pc.used_for_app = pool_item.second["used_for_app"].as_string();
|
pc.used_for_app = pool_item.second["used_for_app"].as_string();
|
||||||
|
// Local Read Configuration
|
||||||
|
std::string local_reads = pool_item.second["local_reads"].string_value();
|
||||||
|
if (local_reads == "nearest")
|
||||||
|
pc.local_reads = POOL_LOCAL_READ_NEAREST;
|
||||||
|
else if (local_reads == "random")
|
||||||
|
pc.local_reads = POOL_LOCAL_READ_RANDOM;
|
||||||
|
else if (local_reads == "" || local_reads == "primary")
|
||||||
|
pc.local_reads = POOL_LOCAL_READ_PRIMARY;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pc.local_reads = POOL_LOCAL_READ_PRIMARY;
|
||||||
|
fprintf(stderr, "Warning: Pool %u has invalid local_reads, using 'primary'\n", pool_id);
|
||||||
|
}
|
||||||
// Immediate Commit Mode
|
// Immediate Commit Mode
|
||||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||||
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
|
||||||
|
@ -1256,6 +1269,13 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (key == etcd_prefix+"/config/node_placement")
|
||||||
|
{
|
||||||
|
// <etcd_prefix>/config/node_placement
|
||||||
|
node_placement = value;
|
||||||
|
if (on_change_node_placement_hook)
|
||||||
|
on_change_node_placement_hook();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value)
|
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value)
|
||||||
|
|
|
@ -25,6 +25,10 @@
|
||||||
#define IMMEDIATE_ALL 2
|
#define IMMEDIATE_ALL 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define POOL_LOCAL_READ_PRIMARY 0
|
||||||
|
#define POOL_LOCAL_READ_NEAREST 1
|
||||||
|
#define POOL_LOCAL_READ_RANDOM 2
|
||||||
|
|
||||||
struct etcd_kv_t
|
struct etcd_kv_t
|
||||||
{
|
{
|
||||||
std::string key;
|
std::string key;
|
||||||
|
@ -48,21 +52,22 @@ struct pg_config_t
|
||||||
|
|
||||||
struct pool_config_t
|
struct pool_config_t
|
||||||
{
|
{
|
||||||
bool exists;
|
bool exists = false;
|
||||||
pool_id_t id;
|
pool_id_t id = 0;
|
||||||
std::string name;
|
std::string name;
|
||||||
uint64_t scheme;
|
uint64_t scheme = 0;
|
||||||
uint64_t pg_size, pg_minsize, parity_chunks;
|
uint64_t pg_size = 0, pg_minsize = 0, parity_chunks = 0;
|
||||||
uint32_t data_block_size, bitmap_granularity, immediate_commit;
|
uint32_t data_block_size = 0, bitmap_granularity = 0, immediate_commit = 0;
|
||||||
uint64_t pg_count;
|
uint64_t pg_count = 0;
|
||||||
uint64_t real_pg_count;
|
uint64_t real_pg_count = 0;
|
||||||
std::string failure_domain;
|
std::string failure_domain;
|
||||||
uint64_t max_osd_combinations;
|
uint64_t max_osd_combinations = 0;
|
||||||
uint64_t pg_stripe_size;
|
uint64_t pg_stripe_size = 0;
|
||||||
std::map<pg_num_t, pg_config_t> pg_config;
|
std::map<pg_num_t, pg_config_t> pg_config;
|
||||||
uint64_t scrub_interval;
|
uint64_t scrub_interval = 0;
|
||||||
std::string used_for_app;
|
std::string used_for_app;
|
||||||
int backfillfull;
|
int backfillfull = 0;
|
||||||
|
int local_reads = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct inode_config_t
|
struct inode_config_t
|
||||||
|
@ -130,6 +135,7 @@ public:
|
||||||
std::set<osd_num_t> seen_peers;
|
std::set<osd_num_t> seen_peers;
|
||||||
std::map<inode_t, inode_config_t> inode_config;
|
std::map<inode_t, inode_config_t> inode_config;
|
||||||
std::map<std::string, inode_t> inode_by_name;
|
std::map<std::string, inode_t> inode_by_name;
|
||||||
|
json11::Json node_placement;
|
||||||
|
|
||||||
std::function<void(std::map<std::string, etcd_kv_t> &)> on_change_hook;
|
std::function<void(std::map<std::string, etcd_kv_t> &)> on_change_hook;
|
||||||
std::function<void(json11::Json::object &)> on_load_config_hook;
|
std::function<void(json11::Json::object &)> on_load_config_hook;
|
||||||
|
@ -140,6 +146,7 @@ public:
|
||||||
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
|
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
|
||||||
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
||||||
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
||||||
|
std::function<void()> on_change_node_placement_hook;
|
||||||
std::function<void()> on_reload_hook;
|
std::function<void()> on_reload_hook;
|
||||||
std::function<void(inode_t, bool)> on_inode_change_hook;
|
std::function<void(inode_t, bool)> on_inode_change_hook;
|
||||||
std::function<void(http_co_t *)> on_start_watcher_hook;
|
std::function<void(http_co_t *)> on_start_watcher_hook;
|
||||||
|
|
|
@ -167,6 +167,10 @@ void osd_messenger_t::init()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
if (ringloop)
|
||||||
|
{
|
||||||
|
has_sendmsg_zc = ringloop->has_sendmsg_zc();
|
||||||
|
}
|
||||||
if (ringloop && iothread_count > 0)
|
if (ringloop && iothread_count > 0)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < iothread_count; i++)
|
for (int i = 0; i < iothread_count; i++)
|
||||||
|
@ -213,7 +217,6 @@ void osd_messenger_t::init()
|
||||||
op->req = (osd_any_op_t){
|
op->req = (osd_any_op_t){
|
||||||
.hdr = {
|
.hdr = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = this->next_subop_id++,
|
|
||||||
.opcode = OSD_OP_PING,
|
.opcode = OSD_OP_PING,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
@ -329,6 +332,9 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
this->receive_buffer_size = 65536;
|
this->receive_buffer_size = 65536;
|
||||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||||
config["use_sync_send_recv"].uint64_value();
|
config["use_sync_send_recv"].uint64_value();
|
||||||
|
this->min_zerocopy_send_size = config["min_zerocopy_send_size"].is_null()
|
||||||
|
? DEFAULT_MIN_ZEROCOPY_SEND_SIZE
|
||||||
|
: (int)config["min_zerocopy_send_size"].int64_value();
|
||||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||||
if (!this->peer_connect_interval)
|
if (!this->peer_connect_interval)
|
||||||
this->peer_connect_interval = 5;
|
this->peer_connect_interval = 5;
|
||||||
|
@ -622,13 +628,19 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
.show_conf = {
|
.show_conf = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = this->next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SHOW_CONFIG,
|
.opcode = OSD_OP_SHOW_CONFIG,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
json11::Json::object payload;
|
||||||
|
if (osd_num)
|
||||||
|
{
|
||||||
|
// Inform that we're OSD <osd_num>
|
||||||
|
payload["osd_num"] = osd_num;
|
||||||
|
}
|
||||||
|
payload["features"] = json11::Json::object{ { "check_sequencing", true } };
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (rdma_contexts.size())
|
if (!use_rdmacm && rdma_contexts.size())
|
||||||
{
|
{
|
||||||
// Choose the right context for the selected network
|
// Choose the right context for the selected network
|
||||||
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
|
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
|
||||||
|
@ -642,19 +654,20 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
cl->rdma_conn = msgr_rdma_connection_t::create(selected_ctx, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
|
cl->rdma_conn = msgr_rdma_connection_t::create(selected_ctx, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
|
||||||
if (cl->rdma_conn)
|
if (cl->rdma_conn)
|
||||||
{
|
{
|
||||||
json11::Json payload = json11::Json::object {
|
payload["connect_rdma"] = cl->rdma_conn->addr.to_string();
|
||||||
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
|
payload["rdma_max_msg"] = cl->rdma_conn->max_msg;
|
||||||
{ "rdma_max_msg", cl->rdma_conn->max_msg },
|
}
|
||||||
};
|
}
|
||||||
std::string payload_str = payload.dump();
|
}
|
||||||
|
#endif
|
||||||
|
if (payload.size())
|
||||||
|
{
|
||||||
|
std::string payload_str = json11::Json(payload).dump();
|
||||||
op->req.show_conf.json_len = payload_str.size();
|
op->req.show_conf.json_len = payload_str.size();
|
||||||
op->buf = malloc_or_die(payload_str.size());
|
op->buf = malloc_or_die(payload_str.size());
|
||||||
op->iov.push_back(op->buf, payload_str.size());
|
op->iov.push_back(op->buf, payload_str.size());
|
||||||
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
op->callback = [this, cl](osd_op_t *op)
|
op->callback = [this, cl](osd_op_t *op)
|
||||||
{
|
{
|
||||||
std::string json_err;
|
std::string json_err;
|
||||||
|
@ -701,7 +714,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (cl->rdma_conn && config["rdma_address"].is_string())
|
if (!use_rdmacm && cl->rdma_conn && config["rdma_address"].is_string())
|
||||||
{
|
{
|
||||||
msgr_rdma_address_t addr;
|
msgr_rdma_address_t addr;
|
||||||
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
||||||
|
@ -760,12 +773,15 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
int one = 1;
|
int one = 1;
|
||||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||||
clients[peer_fd] = new osd_client_t();
|
auto cl = new osd_client_t();
|
||||||
clients[peer_fd]->peer_addr = addr;
|
clients[peer_fd] = cl;
|
||||||
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
cl->is_incoming = true;
|
||||||
clients[peer_fd]->peer_fd = peer_fd;
|
cl->peer_addr = addr;
|
||||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
cl->peer_addr = addr;
|
||||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||||
|
cl->peer_fd = peer_fd;
|
||||||
|
cl->peer_state = PEER_CONNECTED;
|
||||||
|
cl->in_buf = malloc_or_die(receive_buffer_size);
|
||||||
// Add FD to epoll
|
// Add FD to epoll
|
||||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||||
{
|
{
|
||||||
|
@ -800,7 +816,8 @@ bool osd_messenger_t::is_rdma_enabled()
|
||||||
{
|
{
|
||||||
return rdma_contexts.size() > 0;
|
return rdma_contexts.size() > 0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
bool osd_messenger_t::is_use_rdmacm()
|
bool osd_messenger_t::is_use_rdmacm()
|
||||||
{
|
{
|
||||||
return use_rdmacm;
|
return use_rdmacm;
|
||||||
|
@ -896,6 +913,7 @@ static const char* local_only_params[] = {
|
||||||
"tcp_header_buffer_size",
|
"tcp_header_buffer_size",
|
||||||
"use_rdma",
|
"use_rdma",
|
||||||
"use_sync_send_recv",
|
"use_sync_send_recv",
|
||||||
|
"min_zerocopy_send_size",
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
|
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
|
||||||
|
|
|
@ -32,6 +32,8 @@
|
||||||
|
|
||||||
#define VITASTOR_CONFIG_PATH "/etc/vitastor/vitastor.conf"
|
#define VITASTOR_CONFIG_PATH "/etc/vitastor/vitastor.conf"
|
||||||
|
|
||||||
|
#define DEFAULT_MIN_ZEROCOPY_SEND_SIZE 32*1024
|
||||||
|
|
||||||
#define MSGR_SENDP_HDR 1
|
#define MSGR_SENDP_HDR 1
|
||||||
#define MSGR_SENDP_FREE 2
|
#define MSGR_SENDP_FREE 2
|
||||||
|
|
||||||
|
@ -58,6 +60,7 @@ struct osd_client_t
|
||||||
int ping_time_remaining = 0;
|
int ping_time_remaining = 0;
|
||||||
int idle_time_remaining = 0;
|
int idle_time_remaining = 0;
|
||||||
osd_num_t osd_num = 0;
|
osd_num_t osd_num = 0;
|
||||||
|
bool is_incoming = false;
|
||||||
|
|
||||||
void *in_buf = NULL;
|
void *in_buf = NULL;
|
||||||
|
|
||||||
|
@ -73,12 +76,16 @@ struct osd_client_t
|
||||||
int read_remaining = 0;
|
int read_remaining = 0;
|
||||||
int read_state = 0;
|
int read_state = 0;
|
||||||
osd_op_buf_list_t recv_list;
|
osd_op_buf_list_t recv_list;
|
||||||
|
uint64_t read_op_id = 1;
|
||||||
|
bool check_sequencing = false;
|
||||||
|
bool enable_pg_locks = false;
|
||||||
|
|
||||||
// Incoming operations
|
// Incoming operations
|
||||||
std::vector<osd_op_t*> received_ops;
|
std::vector<osd_op_t*> received_ops;
|
||||||
|
|
||||||
// Outbound operations
|
// Outbound operations
|
||||||
std::map<uint64_t, osd_op_t*> sent_ops;
|
std::map<uint64_t, osd_op_t*> sent_ops;
|
||||||
|
uint64_t send_op_id = 0;
|
||||||
|
|
||||||
// PGs dirtied by this client's primary-writes
|
// PGs dirtied by this client's primary-writes
|
||||||
std::set<pool_pg_num_t> dirty_pgs;
|
std::set<pool_pg_num_t> dirty_pgs;
|
||||||
|
@ -88,6 +95,7 @@ struct osd_client_t
|
||||||
int write_state = 0;
|
int write_state = 0;
|
||||||
std::vector<iovec> send_list, next_send_list;
|
std::vector<iovec> send_list, next_send_list;
|
||||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||||
|
std::vector<osd_op_t*> zc_free_list;
|
||||||
|
|
||||||
~osd_client_t();
|
~osd_client_t();
|
||||||
};
|
};
|
||||||
|
@ -97,6 +105,7 @@ struct osd_wanted_peer_t
|
||||||
json11::Json raw_address_list;
|
json11::Json raw_address_list;
|
||||||
json11::Json address_list;
|
json11::Json address_list;
|
||||||
int port = 0;
|
int port = 0;
|
||||||
|
// FIXME: Remove separate WITH_RDMACM?
|
||||||
#ifdef WITH_RDMACM
|
#ifdef WITH_RDMACM
|
||||||
int rdmacm_port = 0;
|
int rdmacm_port = 0;
|
||||||
#endif
|
#endif
|
||||||
|
@ -175,6 +184,7 @@ protected:
|
||||||
int osd_ping_timeout = 0;
|
int osd_ping_timeout = 0;
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
bool use_sync_send_recv = false;
|
bool use_sync_send_recv = false;
|
||||||
|
int min_zerocopy_send_size = DEFAULT_MIN_ZEROCOPY_SEND_SIZE;
|
||||||
int iothread_count = 0;
|
int iothread_count = 0;
|
||||||
|
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
|
@ -201,11 +211,11 @@ protected:
|
||||||
std::vector<osd_op_t*> set_immediate_ops;
|
std::vector<osd_op_t*> set_immediate_ops;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
timerfd_manager_t *tfd;
|
timerfd_manager_t *tfd = NULL;
|
||||||
ring_loop_t *ringloop;
|
ring_loop_t *ringloop = NULL;
|
||||||
|
bool has_sendmsg_zc = false;
|
||||||
// osd_num_t is only for logging and asserts
|
// osd_num_t is only for logging and asserts
|
||||||
osd_num_t osd_num;
|
osd_num_t osd_num;
|
||||||
uint64_t next_subop_id = 1;
|
|
||||||
std::map<int, osd_client_t*> clients;
|
std::map<int, osd_client_t*> clients;
|
||||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||||
std::map<uint64_t, int> osd_peer_fds;
|
std::map<uint64_t, int> osd_peer_fds;
|
||||||
|
@ -261,7 +271,7 @@ protected:
|
||||||
void cancel_op(osd_op_t *op);
|
void cancel_op(osd_op_t *op);
|
||||||
|
|
||||||
bool try_send(osd_client_t *cl);
|
bool try_send(osd_client_t *cl);
|
||||||
void handle_send(int result, osd_client_t *cl);
|
void handle_send(int result, bool prev, bool more, osd_client_t *cl);
|
||||||
|
|
||||||
bool handle_read(int result, osd_client_t *cl);
|
bool handle_read(int result, osd_client_t *cl);
|
||||||
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||||
|
@ -286,6 +296,7 @@ protected:
|
||||||
msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
|
msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
|
||||||
void rdmacm_accept(rdma_cm_event *ev);
|
void rdmacm_accept(rdma_cm_event *ev);
|
||||||
void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
|
void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
|
||||||
|
void rdmacm_set_conn_timeout(rdmacm_connecting_t *conn);
|
||||||
void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
|
void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
|
||||||
void rdmacm_address_resolved(rdma_cm_event *ev);
|
void rdmacm_address_resolved(rdma_cm_event *ev);
|
||||||
void rdmacm_route_resolved(rdma_cm_event *ev);
|
void rdmacm_route_resolved(rdma_cm_event *ev);
|
||||||
|
|
|
@ -70,6 +70,7 @@ msgr_rdma_context_t::~msgr_rdma_context_t()
|
||||||
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||||
{
|
{
|
||||||
ctx->reserve_cqe(-max_send-max_recv);
|
ctx->reserve_cqe(-max_send-max_recv);
|
||||||
|
#ifdef WITH_RDMACM
|
||||||
if (qp && !cmid)
|
if (qp && !cmid)
|
||||||
ibv_destroy_qp(qp);
|
ibv_destroy_qp(qp);
|
||||||
if (cmid)
|
if (cmid)
|
||||||
|
@ -79,6 +80,10 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||||
rdma_destroy_qp(cmid);
|
rdma_destroy_qp(cmid);
|
||||||
rdma_destroy_id(cmid);
|
rdma_destroy_id(cmid);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
if (qp)
|
||||||
|
ibv_destroy_qp(qp);
|
||||||
|
#endif
|
||||||
if (recv_buffers.size())
|
if (recv_buffers.size())
|
||||||
{
|
{
|
||||||
for (auto b: recv_buffers)
|
for (auto b: recv_buffers)
|
||||||
|
@ -798,6 +803,9 @@ void osd_messenger_t::handle_rdma_events(msgr_rdma_context_t *rdma_context)
|
||||||
}
|
}
|
||||||
if (!is_send)
|
if (!is_send)
|
||||||
{
|
{
|
||||||
|
// Reset OSD ping state - client is obviously alive
|
||||||
|
cl->ping_time_remaining = 0;
|
||||||
|
cl->idle_time_remaining = osd_idle_timeout;
|
||||||
rc->cur_recv--;
|
rc->cur_recv--;
|
||||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||||
{
|
{
|
||||||
|
|
|
@ -70,7 +70,7 @@ void osd_messenger_t::rdmacm_destroy_listener(rdma_cm_id *listener)
|
||||||
|
|
||||||
void osd_messenger_t::handle_rdmacm_events()
|
void osd_messenger_t::handle_rdmacm_events()
|
||||||
{
|
{
|
||||||
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(
|
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(...
|
||||||
std::vector<rdma_cm_event> events_copy;
|
std::vector<rdma_cm_event> events_copy;
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
|
@ -83,7 +83,15 @@ void osd_messenger_t::handle_rdmacm_events()
|
||||||
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
// ...so we save a copy of all events EXCEPT connection requests, otherwise they sometimes fail with EVENT_DISCONNECT
|
||||||
|
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||||
|
{
|
||||||
|
rdmacm_accept(ev);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
events_copy.push_back(*ev);
|
events_copy.push_back(*ev);
|
||||||
|
}
|
||||||
r = rdma_ack_cm_event(ev);
|
r = rdma_ack_cm_event(ev);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
|
@ -96,7 +104,7 @@ void osd_messenger_t::handle_rdmacm_events()
|
||||||
auto ev = &evl;
|
auto ev = &evl;
|
||||||
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||||
{
|
{
|
||||||
rdmacm_accept(ev);
|
// Do nothing, handled above
|
||||||
}
|
}
|
||||||
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
|
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
|
||||||
ev->event == RDMA_CM_EVENT_REJECTED ||
|
ev->event == RDMA_CM_EVENT_REJECTED ||
|
||||||
|
@ -287,29 +295,34 @@ void osd_messenger_t::rdmacm_accept(rdma_cm_event *ev)
|
||||||
rdma_destroy_id(ev->id);
|
rdma_destroy_id(ev->id);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
rdma_context->cm_refs++;
|
// Wait for RDMA_CM_ESTABLISHED, and enable the connection only after it
|
||||||
// Wrap into a new msgr_rdma_connection_t
|
auto conn = new rdmacm_connecting_t;
|
||||||
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
|
|
||||||
conn->ctx = rdma_context;
|
|
||||||
conn->max_send = rdma_max_send;
|
|
||||||
conn->max_recv = rdma_max_recv;
|
|
||||||
conn->max_sge = rdma_max_sge > rdma_context->attrx.orig_attr.max_sge
|
|
||||||
? rdma_context->attrx.orig_attr.max_sge : rdma_max_sge;
|
|
||||||
conn->max_msg = rdma_max_msg;
|
|
||||||
conn->cmid = ev->id;
|
conn->cmid = ev->id;
|
||||||
conn->qp = ev->id->qp;
|
conn->peer_fd = fake_fd;
|
||||||
auto cl = new osd_client_t();
|
conn->parsed_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
|
||||||
cl->peer_fd = fake_fd;
|
conn->rdma_context = rdma_context;
|
||||||
cl->peer_state = PEER_RDMA;
|
rdmacm_set_conn_timeout(conn);
|
||||||
cl->peer_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
|
rdmacm_connecting[ev->id] = conn;
|
||||||
cl->in_buf = malloc_or_die(receive_buffer_size);
|
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, conn->peer_fd,
|
||||||
cl->rdma_conn = conn;
|
addr_to_string(conn->parsed_addr).c_str());
|
||||||
clients[fake_fd] = cl;
|
}
|
||||||
rdmacm_connections[ev->id] = cl;
|
|
||||||
// Add initial receive request(s)
|
void osd_messenger_t::rdmacm_set_conn_timeout(rdmacm_connecting_t *conn)
|
||||||
try_recv_rdma(cl);
|
{
|
||||||
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, fake_fd,
|
conn->timeout_ms = peer_connect_timeout*1000;
|
||||||
addr_to_string(cl->peer_addr).c_str());
|
if (peer_connect_timeout > 0)
|
||||||
|
{
|
||||||
|
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid = conn->cmid](int timer_id)
|
||||||
|
{
|
||||||
|
auto conn = rdmacm_connecting.at(cmid);
|
||||||
|
conn->timeout_id = -1;
|
||||||
|
if (conn->peer_osd)
|
||||||
|
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
|
||||||
|
else
|
||||||
|
fprintf(stderr, "Incoming RDMA-CM connection from %s timed out\n", addr_to_string(conn->parsed_addr).c_str());
|
||||||
|
rdmacm_on_connect_peer_error(cmid, -EPIPE);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
||||||
|
@ -332,6 +345,8 @@ void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
||||||
}
|
}
|
||||||
rdmacm_connecting.erase(cmid);
|
rdmacm_connecting.erase(cmid);
|
||||||
delete conn;
|
delete conn;
|
||||||
|
if (peer_osd)
|
||||||
|
{
|
||||||
if (!disable_tcp)
|
if (!disable_tcp)
|
||||||
{
|
{
|
||||||
// Fall back to TCP instead of just reporting the error to on_connect_peer()
|
// Fall back to TCP instead of just reporting the error to on_connect_peer()
|
||||||
|
@ -343,6 +358,7 @@ void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
||||||
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
|
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port)
|
void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port)
|
||||||
{
|
{
|
||||||
|
@ -374,6 +390,8 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
|
||||||
on_connect_peer(peer_osd, res);
|
on_connect_peer(peer_osd, res);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (log_level > 0)
|
||||||
|
fprintf(stderr, "Trying to connect to OSD %ju at %s:%d via RDMA-CM\n", peer_osd, addr.c_str(), rdmacm_port);
|
||||||
auto conn = new rdmacm_connecting_t;
|
auto conn = new rdmacm_connecting_t;
|
||||||
rdmacm_connecting[cmid] = conn;
|
rdmacm_connecting[cmid] = conn;
|
||||||
conn->cmid = cmid;
|
conn->cmid = cmid;
|
||||||
|
@ -383,19 +401,7 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
|
||||||
conn->parsed_addr = sa;
|
conn->parsed_addr = sa;
|
||||||
conn->rdmacm_port = rdmacm_port;
|
conn->rdmacm_port = rdmacm_port;
|
||||||
conn->tcp_port = fallback_tcp_port;
|
conn->tcp_port = fallback_tcp_port;
|
||||||
conn->timeout_ms = peer_connect_timeout*1000;
|
rdmacm_set_conn_timeout(conn);
|
||||||
conn->timeout_id = -1;
|
|
||||||
if (peer_connect_timeout > 0)
|
|
||||||
{
|
|
||||||
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid](int timer_id)
|
|
||||||
{
|
|
||||||
auto conn = rdmacm_connecting.at(cmid);
|
|
||||||
conn->timeout_id = -1;
|
|
||||||
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
|
|
||||||
rdmacm_on_connect_peer_error(cmid, -EPIPE);
|
|
||||||
return;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
|
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
|
||||||
{
|
{
|
||||||
auto res = -errno;
|
auto res = -errno;
|
||||||
|
@ -494,7 +500,7 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
|
||||||
// Wrap into a new msgr_rdma_connection_t
|
// Wrap into a new msgr_rdma_connection_t
|
||||||
msgr_rdma_connection_t *rc = new msgr_rdma_connection_t;
|
msgr_rdma_connection_t *rc = new msgr_rdma_connection_t;
|
||||||
rc->ctx = conn->rdma_context;
|
rc->ctx = conn->rdma_context;
|
||||||
rc->ctx->cm_refs++;
|
rc->ctx->cm_refs++; // FIXME now unused, count also connecting_t's when used
|
||||||
rc->max_send = rdma_max_send;
|
rc->max_send = rdma_max_send;
|
||||||
rc->max_recv = rdma_max_recv;
|
rc->max_recv = rdma_max_recv;
|
||||||
rc->max_sge = rdma_max_sge > rc->ctx->attrx.orig_attr.max_sge
|
rc->max_sge = rdma_max_sge > rc->ctx->attrx.orig_attr.max_sge
|
||||||
|
@ -504,6 +510,7 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
|
||||||
rc->qp = conn->cmid->qp;
|
rc->qp = conn->cmid->qp;
|
||||||
// And an osd_client_t
|
// And an osd_client_t
|
||||||
auto cl = new osd_client_t();
|
auto cl = new osd_client_t();
|
||||||
|
cl->is_incoming = true;
|
||||||
cl->peer_addr = conn->parsed_addr;
|
cl->peer_addr = conn->parsed_addr;
|
||||||
cl->peer_port = conn->rdmacm_port;
|
cl->peer_port = conn->rdmacm_port;
|
||||||
cl->peer_fd = conn->peer_fd;
|
cl->peer_fd = conn->peer_fd;
|
||||||
|
@ -514,14 +521,20 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
|
||||||
cl->rdma_conn = rc;
|
cl->rdma_conn = rc;
|
||||||
clients[conn->peer_fd] = cl;
|
clients[conn->peer_fd] = cl;
|
||||||
if (conn->timeout_id >= 0)
|
if (conn->timeout_id >= 0)
|
||||||
|
{
|
||||||
tfd->clear_timer(conn->timeout_id);
|
tfd->clear_timer(conn->timeout_id);
|
||||||
|
}
|
||||||
delete conn;
|
delete conn;
|
||||||
rdmacm_connecting.erase(cmid);
|
rdmacm_connecting.erase(cmid);
|
||||||
rdmacm_connections[cmid] = cl;
|
rdmacm_connections[cmid] = cl;
|
||||||
if (log_level > 0)
|
if (log_level > 0 && peer_osd)
|
||||||
|
{
|
||||||
fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
|
fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
|
||||||
|
}
|
||||||
// Add initial receive request(s)
|
// Add initial receive request(s)
|
||||||
try_recv_rdma(cl);
|
try_recv_rdma(cl);
|
||||||
osd_peer_fds[peer_osd] = cl->peer_fd;
|
if (peer_osd)
|
||||||
on_connect_peer(peer_osd, cl->peer_fd);
|
{
|
||||||
|
check_peer_config(cl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,8 +33,12 @@ void osd_messenger_t::read_requests()
|
||||||
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
||||||
io_uring_sqe sqe_local;
|
io_uring_sqe sqe_local;
|
||||||
ring_data_t data_local;
|
ring_data_t data_local;
|
||||||
sqe_local.user_data = (uint64_t)&data_local;
|
|
||||||
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
||||||
|
if (iothread)
|
||||||
|
{
|
||||||
|
sqe_local = { .user_data = (uint64_t)&data_local };
|
||||||
|
data_local = {};
|
||||||
|
}
|
||||||
if (!sqe)
|
if (!sqe)
|
||||||
{
|
{
|
||||||
cl->read_msg.msg_iovlen = 0;
|
cl->read_msg.msg_iovlen = 0;
|
||||||
|
@ -214,6 +218,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
||||||
|
|
||||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
|
// Reset OSD ping state
|
||||||
cl->ping_time_remaining = 0;
|
cl->ping_time_remaining = 0;
|
||||||
cl->idle_time_remaining = osd_idle_timeout;
|
cl->idle_time_remaining = osd_idle_timeout;
|
||||||
cl->recv_list.reset();
|
cl->recv_list.reset();
|
||||||
|
@ -222,7 +227,19 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||||
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
||||||
return handle_reply_hdr(cl);
|
return handle_reply_hdr(cl);
|
||||||
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
|
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
|
||||||
|
{
|
||||||
|
if (cl->check_sequencing)
|
||||||
|
{
|
||||||
|
if (cl->read_op->req.hdr.id != cl->read_op_id)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Warning: operation sequencing is broken on client %d, stopping client\n", cl->peer_fd);
|
||||||
|
stop_client(cl->peer_fd);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
cl->read_op_id++;
|
||||||
|
}
|
||||||
handle_op_hdr(cl);
|
handle_op_hdr(cl);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||||
|
|
|
@ -14,6 +14,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||||
if (cur_op->op_type == OSD_OP_OUT)
|
if (cur_op->op_type == OSD_OP_OUT)
|
||||||
{
|
{
|
||||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||||
|
cur_op->req.hdr.id = ++cl->send_op_id;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -193,18 +194,36 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||||
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
|
||||||
io_uring_sqe sqe_local;
|
io_uring_sqe sqe_local;
|
||||||
ring_data_t data_local;
|
ring_data_t data_local;
|
||||||
sqe_local.user_data = (uint64_t)&data_local;
|
|
||||||
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
|
||||||
if (!sqe)
|
if (iothread)
|
||||||
{
|
{
|
||||||
return false;
|
sqe_local = { .user_data = (uint64_t)&data_local };
|
||||||
|
data_local = {};
|
||||||
}
|
}
|
||||||
|
if (!sqe)
|
||||||
|
return false;
|
||||||
cl->write_msg.msg_iov = cl->send_list.data();
|
cl->write_msg.msg_iov = cl->send_list.data();
|
||||||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||||
cl->refs++;
|
cl->refs++;
|
||||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||||
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, data->prev, data->more, cl); };
|
||||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
bool use_zc = has_sendmsg_zc && min_zerocopy_send_size >= 0;
|
||||||
|
if (use_zc && min_zerocopy_send_size > 0)
|
||||||
|
{
|
||||||
|
size_t avg_size = 0;
|
||||||
|
for (size_t i = 0; i < cl->write_msg.msg_iovlen; i++)
|
||||||
|
avg_size += cl->write_msg.msg_iov[i].iov_len;
|
||||||
|
if (avg_size/cl->write_msg.msg_iovlen < min_zerocopy_send_size)
|
||||||
|
use_zc = false;
|
||||||
|
}
|
||||||
|
if (use_zc)
|
||||||
|
{
|
||||||
|
my_uring_prep_sendmsg_zc(sqe, peer_fd, &cl->write_msg, MSG_WAITALL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, MSG_WAITALL);
|
||||||
|
}
|
||||||
if (iothread)
|
if (iothread)
|
||||||
{
|
{
|
||||||
iothread->add_sqe(sqe_local);
|
iothread->add_sqe(sqe_local);
|
||||||
|
@ -220,7 +239,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
result = -errno;
|
result = -errno;
|
||||||
}
|
}
|
||||||
handle_send(result, cl);
|
handle_send(result, false, false, cl);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -240,10 +259,16 @@ void osd_messenger_t::send_replies()
|
||||||
write_ready_clients.clear();
|
write_ready_clients.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
void osd_messenger_t::handle_send(int result, bool prev, bool more, osd_client_t *cl)
|
||||||
|
{
|
||||||
|
if (!prev)
|
||||||
{
|
{
|
||||||
cl->write_msg.msg_iovlen = 0;
|
cl->write_msg.msg_iovlen = 0;
|
||||||
|
}
|
||||||
|
if (!more)
|
||||||
|
{
|
||||||
cl->refs--;
|
cl->refs--;
|
||||||
|
}
|
||||||
if (cl->peer_state == PEER_STOPPED)
|
if (cl->peer_state == PEER_STOPPED)
|
||||||
{
|
{
|
||||||
if (cl->refs <= 0)
|
if (cl->refs <= 0)
|
||||||
|
@ -261,6 +286,16 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||||
}
|
}
|
||||||
if (result >= 0)
|
if (result >= 0)
|
||||||
{
|
{
|
||||||
|
if (prev)
|
||||||
|
{
|
||||||
|
// Second notification - only free a batch of postponed ops
|
||||||
|
int i = 0;
|
||||||
|
for (; i < cl->zc_free_list.size() && cl->zc_free_list[i]; i++)
|
||||||
|
delete cl->zc_free_list[i];
|
||||||
|
if (i > 0)
|
||||||
|
cl->zc_free_list.erase(cl->zc_free_list.begin(), cl->zc_free_list.begin()+i+1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
int done = 0;
|
int done = 0;
|
||||||
while (result > 0 && done < cl->send_list.size())
|
while (result > 0 && done < cl->send_list.size())
|
||||||
{
|
{
|
||||||
|
@ -270,6 +305,9 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||||
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||||
{
|
{
|
||||||
// Reply fully sent
|
// Reply fully sent
|
||||||
|
if (more)
|
||||||
|
cl->zc_free_list.push_back(cl->outbox[done].op);
|
||||||
|
else
|
||||||
delete cl->outbox[done].op;
|
delete cl->outbox[done].op;
|
||||||
}
|
}
|
||||||
result -= iov.iov_len;
|
result -= iov.iov_len;
|
||||||
|
@ -282,6 +320,12 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (more)
|
||||||
|
{
|
||||||
|
auto expected = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||||
|
assert(done == expected);
|
||||||
|
cl->zc_free_list.push_back(NULL); // end marker
|
||||||
|
}
|
||||||
if (done > 0)
|
if (done > 0)
|
||||||
{
|
{
|
||||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
|
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
|
||||||
|
|
|
@ -23,4 +23,5 @@ const char* osd_op_names[] = {
|
||||||
"sec_read_bmp",
|
"sec_read_bmp",
|
||||||
"scrub",
|
"scrub",
|
||||||
"describe",
|
"describe",
|
||||||
|
"sec_lock",
|
||||||
};
|
};
|
||||||
|
|
|
@ -31,10 +31,13 @@
|
||||||
#define OSD_OP_SEC_READ_BMP 16
|
#define OSD_OP_SEC_READ_BMP 16
|
||||||
#define OSD_OP_SCRUB 17
|
#define OSD_OP_SCRUB 17
|
||||||
#define OSD_OP_DESCRIBE 18
|
#define OSD_OP_DESCRIBE 18
|
||||||
#define OSD_OP_MAX 18
|
#define OSD_OP_SEC_LOCK 19
|
||||||
|
#define OSD_OP_MAX 19
|
||||||
#define OSD_RW_MAX 64*1024*1024
|
#define OSD_RW_MAX 64*1024*1024
|
||||||
#define OSD_PROTOCOL_VERSION 1
|
#define OSD_PROTOCOL_VERSION 1
|
||||||
|
|
||||||
#define OSD_OP_RECOVERY_RELATED (uint32_t)1
|
#define OSD_OP_RECOVERY_RELATED (uint32_t)1
|
||||||
|
#define OSD_OP_IGNORE_PG_LOCK (uint32_t)2
|
||||||
|
|
||||||
// Memory alignment for direct I/O (usually 512 bytes)
|
// Memory alignment for direct I/O (usually 512 bytes)
|
||||||
#ifndef DIRECT_IO_ALIGNMENT
|
#ifndef DIRECT_IO_ALIGNMENT
|
||||||
|
@ -56,6 +59,9 @@
|
||||||
#define OSD_DEL_SUPPORT_LEFT_ON_DEAD 1
|
#define OSD_DEL_SUPPORT_LEFT_ON_DEAD 1
|
||||||
#define OSD_DEL_LEFT_ON_DEAD 2
|
#define OSD_DEL_LEFT_ON_DEAD 2
|
||||||
|
|
||||||
|
#define OSD_SEC_LOCK_PG 1
|
||||||
|
#define OSD_SEC_UNLOCK_PG 2
|
||||||
|
|
||||||
// common request and reply headers
|
// common request and reply headers
|
||||||
struct __attribute__((__packed__)) osd_op_header_t
|
struct __attribute__((__packed__)) osd_op_header_t
|
||||||
{
|
{
|
||||||
|
@ -94,7 +100,7 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t
|
||||||
uint32_t len;
|
uint32_t len;
|
||||||
// bitmap/attribute length - bitmap comes after header, but before data
|
// bitmap/attribute length - bitmap comes after header, but before data
|
||||||
uint32_t attr_len;
|
uint32_t attr_len;
|
||||||
// the only possible flag is OSD_OP_RECOVERY_RELATED
|
// OSD_OP_RECOVERY_RELATED, OSD_OP_IGNORE_PG_LOCK
|
||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -116,7 +122,7 @@ struct __attribute__((__packed__)) osd_op_sec_del_t
|
||||||
object_id oid;
|
object_id oid;
|
||||||
// delete version (automatic or specific)
|
// delete version (automatic or specific)
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
// the only possible flag is OSD_OP_RECOVERY_RELATED
|
// OSD_OP_RECOVERY_RELATED, OSD_OP_IGNORE_PG_LOCK
|
||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
uint32_t pad0;
|
uint32_t pad0;
|
||||||
};
|
};
|
||||||
|
@ -131,7 +137,7 @@ struct __attribute__((__packed__)) osd_reply_sec_del_t
|
||||||
struct __attribute__((__packed__)) osd_op_sec_sync_t
|
struct __attribute__((__packed__)) osd_op_sec_sync_t
|
||||||
{
|
{
|
||||||
osd_op_header_t header;
|
osd_op_header_t header;
|
||||||
// the only possible flag is OSD_OP_RECOVERY_RELATED
|
// OSD_OP_RECOVERY_RELATED, OSD_OP_IGNORE_PG_LOCK
|
||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
uint32_t pad0;
|
uint32_t pad0;
|
||||||
};
|
};
|
||||||
|
@ -147,7 +153,7 @@ struct __attribute__((__packed__)) osd_op_sec_stab_t
|
||||||
osd_op_header_t header;
|
osd_op_header_t header;
|
||||||
// obj_ver_id array length in bytes
|
// obj_ver_id array length in bytes
|
||||||
uint64_t len;
|
uint64_t len;
|
||||||
// the only possible flag is OSD_OP_RECOVERY_RELATED
|
// OSD_OP_RECOVERY_RELATED, OSD_OP_IGNORE_PG_LOCK
|
||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
uint32_t pad0;
|
uint32_t pad0;
|
||||||
};
|
};
|
||||||
|
@ -165,6 +171,8 @@ struct __attribute__((__packed__)) osd_op_sec_read_bmp_t
|
||||||
osd_op_header_t header;
|
osd_op_header_t header;
|
||||||
// obj_ver_id array length in bytes
|
// obj_ver_id array length in bytes
|
||||||
uint64_t len;
|
uint64_t len;
|
||||||
|
// OSD_OP_RECOVERY_RELATED, OSD_OP_IGNORE_PG_LOCK
|
||||||
|
uint32_t flags;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
|
struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
|
||||||
|
@ -173,7 +181,7 @@ struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
|
||||||
osd_reply_header_t header;
|
osd_reply_header_t header;
|
||||||
};
|
};
|
||||||
|
|
||||||
// show configuration
|
// show configuration and remember peer information
|
||||||
struct __attribute__((__packed__)) osd_op_show_config_t
|
struct __attribute__((__packed__)) osd_op_show_config_t
|
||||||
{
|
{
|
||||||
osd_op_header_t header;
|
osd_op_header_t header;
|
||||||
|
@ -303,6 +311,25 @@ struct __attribute__((__packed__)) osd_reply_describe_item_t
|
||||||
osd_num_t osd_num; // OSD number
|
osd_num_t osd_num; // OSD number
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// lock/unlock PG for use by a primary OSD
|
||||||
|
struct __attribute__((__packed__)) osd_op_sec_lock_t
|
||||||
|
{
|
||||||
|
osd_op_header_t header;
|
||||||
|
// OSD_SEC_LOCK_PG or OSD_SEC_UNLOCK_PG
|
||||||
|
uint64_t flags;
|
||||||
|
// Pool ID and PG number
|
||||||
|
uint64_t pool_id;
|
||||||
|
uint64_t pg_num;
|
||||||
|
// PG state as calculated by the primary OSD
|
||||||
|
uint64_t pg_state;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __attribute__((__packed__)) osd_reply_sec_lock_t
|
||||||
|
{
|
||||||
|
osd_reply_header_t header;
|
||||||
|
uint64_t cur_primary;
|
||||||
|
};
|
||||||
|
|
||||||
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
|
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
|
||||||
union osd_any_op_t
|
union osd_any_op_t
|
||||||
{
|
{
|
||||||
|
@ -313,6 +340,7 @@ union osd_any_op_t
|
||||||
osd_op_sec_stab_t sec_stab;
|
osd_op_sec_stab_t sec_stab;
|
||||||
osd_op_sec_read_bmp_t sec_read_bmp;
|
osd_op_sec_read_bmp_t sec_read_bmp;
|
||||||
osd_op_sec_list_t sec_list;
|
osd_op_sec_list_t sec_list;
|
||||||
|
osd_op_sec_lock_t sec_lock;
|
||||||
osd_op_show_config_t show_conf;
|
osd_op_show_config_t show_conf;
|
||||||
osd_op_rw_t rw;
|
osd_op_rw_t rw;
|
||||||
osd_op_sync_t sync;
|
osd_op_sync_t sync;
|
||||||
|
@ -329,6 +357,7 @@ union osd_any_reply_t
|
||||||
osd_reply_sec_stab_t sec_stab;
|
osd_reply_sec_stab_t sec_stab;
|
||||||
osd_reply_sec_read_bmp_t sec_read_bmp;
|
osd_reply_sec_read_bmp_t sec_read_bmp;
|
||||||
osd_reply_sec_list_t sec_list;
|
osd_reply_sec_list_t sec_list;
|
||||||
|
osd_reply_sec_lock_t sec_lock;
|
||||||
osd_reply_show_config_t show_conf;
|
osd_reply_show_config_t show_conf;
|
||||||
osd_reply_rw_t rw;
|
osd_reply_rw_t rw;
|
||||||
osd_reply_del_t del;
|
osd_reply_del_t del;
|
||||||
|
|
|
@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||||
|
|
||||||
Name: Vitastor
|
Name: Vitastor
|
||||||
Description: Vitastor client library
|
Description: Vitastor client library
|
||||||
Version: 2.1.0
|
Version: 2.2.0
|
||||||
Libs: -L${libdir} -lvitastor_client
|
Libs: -L${libdir} -lvitastor_client
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
||||||
|
|
|
@ -185,6 +185,7 @@ static const char* help_text =
|
||||||
" --immediate_commit all Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
" --immediate_commit all Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||||
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
|
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
|
||||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||||
|
" --local_reads primary Local read policy for replicated pools: primary, nearest or random\n"
|
||||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||||
" --used_for_app fs:<name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
" --used_for_app fs:<name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||||
|
@ -282,6 +283,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
|
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
|
||||||
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
|
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
|
||||||
!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
|
!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
|
||||||
|
!strcmp(opt, "allow-up") || !strcmp(opt, "allow_up") ||
|
||||||
!strcmp(opt, "down-ok") || !strcmp(opt, "down_ok") ||
|
!strcmp(opt, "down-ok") || !strcmp(opt, "down_ok") ||
|
||||||
!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
|
!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
|
||||||
!strcmp(opt, "help") || !strcmp(opt, "all") ||
|
!strcmp(opt, "help") || !strcmp(opt, "all") ||
|
||||||
|
|
|
@ -147,7 +147,6 @@ struct cli_describe_t
|
||||||
.describe = (osd_op_describe_t){
|
.describe = (osd_op_describe_t){
|
||||||
.header = (osd_op_header_t){
|
.header = (osd_op_header_t){
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = parent->cli->next_op_id(),
|
|
||||||
.opcode = OSD_OP_DESCRIBE,
|
.opcode = OSD_OP_DESCRIBE,
|
||||||
},
|
},
|
||||||
.object_state = object_state,
|
.object_state = object_state,
|
||||||
|
|
|
@ -159,7 +159,6 @@ struct cli_fix_t
|
||||||
.describe = {
|
.describe = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = parent->cli->next_op_id(),
|
|
||||||
.opcode = OSD_OP_DESCRIBE,
|
.opcode = OSD_OP_DESCRIBE,
|
||||||
},
|
},
|
||||||
.min_inode = obj.inode,
|
.min_inode = obj.inode,
|
||||||
|
@ -194,7 +193,6 @@ struct cli_fix_t
|
||||||
.sec_del = {
|
.sec_del = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = parent->cli->next_op_id(),
|
|
||||||
.opcode = OSD_OP_SEC_DELETE,
|
.opcode = OSD_OP_SEC_DELETE,
|
||||||
},
|
},
|
||||||
.oid = {
|
.oid = {
|
||||||
|
@ -202,6 +200,7 @@ struct cli_fix_t
|
||||||
.stripe = op->req.describe.min_offset | items[i].role,
|
.stripe = op->req.describe.min_offset | items[i].role,
|
||||||
},
|
},
|
||||||
.version = 0,
|
.version = 0,
|
||||||
|
.flags = OSD_OP_IGNORE_PG_LOCK,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
|
rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
|
||||||
|
@ -242,7 +241,6 @@ struct cli_fix_t
|
||||||
.rw = {
|
.rw = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = parent->cli->next_op_id(),
|
|
||||||
.opcode = OSD_OP_SCRUB,
|
.opcode = OSD_OP_SCRUB,
|
||||||
},
|
},
|
||||||
.inode = obj.inode,
|
.inode = obj.inode,
|
||||||
|
|
|
@ -58,6 +58,12 @@ struct osd_changer_t
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (set_reweight && new_reweight > 1)
|
||||||
|
{
|
||||||
|
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be larger than 1" };
|
||||||
|
state = 100;
|
||||||
|
return;
|
||||||
|
}
|
||||||
parent->etcd_txn(json11::Json::object {
|
parent->etcd_txn(json11::Json::object {
|
||||||
{ "success", json11::Json::array {
|
{ "success", json11::Json::array {
|
||||||
json11::Json::object {
|
json11::Json::object {
|
||||||
|
|
|
@ -44,10 +44,10 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
new_cfg["parity_chunks"] = parity_chunks;
|
new_cfg["parity_chunks"] = parity_chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (old_cfg.is_null() && new_cfg["scheme"].string_value() == "")
|
if (new_cfg["scheme"].string_value() == "")
|
||||||
{
|
{
|
||||||
// Default scheme
|
// Default scheme
|
||||||
new_cfg["scheme"] = "replicated";
|
new_cfg["scheme"] = old_cfg.is_null() ? "replicated" : old_cfg["scheme"];
|
||||||
}
|
}
|
||||||
if (new_cfg.find("pg_minsize") == new_cfg.end() && (old_cfg.is_null() || new_cfg.find("pg_size") != new_cfg.end()))
|
if (new_cfg.find("pg_minsize") == new_cfg.end() && (old_cfg.is_null() || new_cfg.find("pg_size") != new_cfg.end()))
|
||||||
{
|
{
|
||||||
|
@ -91,7 +91,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
}
|
}
|
||||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_app" ||
|
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_app" ||
|
||||||
key == "used_for_fs" || key == "raw_placement")
|
key == "used_for_fs" || key == "raw_placement" || key == "local_reads")
|
||||||
{
|
{
|
||||||
if (!value.is_string())
|
if (!value.is_string())
|
||||||
{
|
{
|
||||||
|
@ -165,6 +165,10 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
new_cfg["used_for_app"] = "fs:"+new_cfg["used_for_fs"].string_value();
|
new_cfg["used_for_app"] = "fs:"+new_cfg["used_for_fs"].string_value();
|
||||||
new_cfg.erase("used_for_fs");
|
new_cfg.erase("used_for_fs");
|
||||||
}
|
}
|
||||||
|
if (new_cfg.find("local_reads") != new_cfg.end() && new_cfg["local_reads"].string_value() == "primary")
|
||||||
|
{
|
||||||
|
new_cfg.erase("local_reads");
|
||||||
|
}
|
||||||
|
|
||||||
// Prevent autovivification of object keys. Now we don't modify the config, we just check it
|
// Prevent autovivification of object keys. Now we don't modify the config, we just check it
|
||||||
json11::Json cfg = new_cfg;
|
json11::Json cfg = new_cfg;
|
||||||
|
@ -340,5 +344,19 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// local_reads
|
||||||
|
if (!cfg["local_reads"].is_null())
|
||||||
|
{
|
||||||
|
auto lr = cfg["local_reads"].string_value();
|
||||||
|
if (lr != "" && lr != "primary" && lr != "nearest" && lr != "random")
|
||||||
|
{
|
||||||
|
return "local_reads must be '', 'primary', 'nearest' or 'random', but it is "+cfg["local_reads"].string_value();
|
||||||
|
}
|
||||||
|
if (lr != "" && lr != "primary" && scheme != POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
return "EC pools don't support localized reads, please clear local_reads or set it to 'primary'";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
|
@ -504,6 +504,7 @@ resume_3:
|
||||||
{ "failure_domain", "Failure domain" },
|
{ "failure_domain", "Failure domain" },
|
||||||
{ "root_node", "Root node" },
|
{ "root_node", "Root node" },
|
||||||
{ "osd_tags_fmt", "OSD tags" },
|
{ "osd_tags_fmt", "OSD tags" },
|
||||||
|
{ "local_reads", "Local read policy" },
|
||||||
{ "primary_affinity_tags_fmt", "Primary affinity" },
|
{ "primary_affinity_tags_fmt", "Primary affinity" },
|
||||||
{ "block_size_fmt", "Block size" },
|
{ "block_size_fmt", "Block size" },
|
||||||
{ "bitmap_granularity_fmt", "Bitmap granularity" },
|
{ "bitmap_granularity_fmt", "Bitmap granularity" },
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include "cli.h"
|
#include "cli.h"
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
#include "epoll_manager.h"
|
#include "epoll_manager.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -14,7 +15,7 @@ struct rm_osd_t
|
||||||
{
|
{
|
||||||
cli_tool_t *parent;
|
cli_tool_t *parent;
|
||||||
|
|
||||||
bool dry_run, force_warning, force_dataloss;
|
bool dry_run, force_warning, force_dataloss, allow_up;
|
||||||
uint64_t etcd_tx_retry_ms = 500;
|
uint64_t etcd_tx_retry_ms = 500;
|
||||||
uint64_t etcd_tx_retries = 10000;
|
uint64_t etcd_tx_retries = 10000;
|
||||||
std::vector<uint64_t> osd_ids;
|
std::vector<uint64_t> osd_ids;
|
||||||
|
@ -22,8 +23,8 @@ struct rm_osd_t
|
||||||
int state = 0;
|
int state = 0;
|
||||||
cli_result_t result;
|
cli_result_t result;
|
||||||
|
|
||||||
std::set<uint64_t> to_remove;
|
std::set<osd_num_t> to_remove;
|
||||||
std::set<uint64_t> to_restart;
|
std::vector<osd_num_t> still_up;
|
||||||
json11::Json::array pool_effects;
|
json11::Json::array pool_effects;
|
||||||
json11::Json::array history_updates, history_checks;
|
json11::Json::array history_updates, history_checks;
|
||||||
json11::Json new_pgs, new_clean_pgs;
|
json11::Json new_pgs, new_clean_pgs;
|
||||||
|
@ -63,8 +64,17 @@ struct rm_osd_t
|
||||||
}
|
}
|
||||||
to_remove.insert(osd_id);
|
to_remove.insert(osd_id);
|
||||||
}
|
}
|
||||||
// Check if OSDs are still used in data distribution
|
|
||||||
is_warning = is_dataloss = false;
|
is_warning = is_dataloss = false;
|
||||||
|
// Check if OSDs are still up
|
||||||
|
for (auto osd_id: to_remove)
|
||||||
|
{
|
||||||
|
if (parent->cli->st_cli.peer_states.find(osd_id) != parent->cli->st_cli.peer_states.end())
|
||||||
|
{
|
||||||
|
is_warning = !allow_up;
|
||||||
|
still_up.push_back(osd_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Check if OSDs are still used in data distribution
|
||||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||||
{
|
{
|
||||||
// Will OSD deletion make pool incomplete / down / degraded?
|
// Will OSD deletion make pool incomplete / down / degraded?
|
||||||
|
@ -158,6 +168,9 @@ struct rm_osd_t
|
||||||
: strtoupper(e["effect"].string_value())+" PGs"))
|
: strtoupper(e["effect"].string_value())+" PGs"))
|
||||||
)+" after deleting OSD(s).\n";
|
)+" after deleting OSD(s).\n";
|
||||||
}
|
}
|
||||||
|
if (still_up.size() && !allow_up)
|
||||||
|
error += (still_up.size() == 1 ? "OSD " : "OSDs ") + implode(", ", still_up) +
|
||||||
|
(still_up.size() == 1 ? "is" : "are") + " still up. Use `vitastor-disk purge` to delete them.\n";
|
||||||
if (is_dataloss && !force_dataloss && !dry_run)
|
if (is_dataloss && !force_dataloss && !dry_run)
|
||||||
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --allow-data-loss if you know what you are doing.\n";
|
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --allow-data-loss if you know what you are doing.\n";
|
||||||
else if (is_warning && !force_warning && !dry_run)
|
else if (is_warning && !force_warning && !dry_run)
|
||||||
|
@ -463,6 +476,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
|
||||||
auto rm_osd = new rm_osd_t();
|
auto rm_osd = new rm_osd_t();
|
||||||
rm_osd->parent = this;
|
rm_osd->parent = this;
|
||||||
rm_osd->dry_run = cfg["dry_run"].bool_value();
|
rm_osd->dry_run = cfg["dry_run"].bool_value();
|
||||||
|
rm_osd->allow_up = cfg["allow_up"].bool_value();
|
||||||
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
|
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
|
||||||
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
|
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
|
||||||
if (!cfg["etcd_tx_retries"].is_null())
|
if (!cfg["etcd_tx_retries"].is_null())
|
||||||
|
|
|
@ -435,7 +435,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
||||||
printf("%s\n", json11::Json(result).dump().c_str());
|
printf("%s\n", json11::Json(result).dump().c_str());
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
std::vector<std::string> rm_osd_cli = { "vitastor-cli", "rm-osd" };
|
std::vector<std::string> rm_osd_cli = { "vitastor-cli", "rm-osd", "--allow-up" };
|
||||||
for (auto osd_num: osd_numbers)
|
for (auto osd_num: osd_numbers)
|
||||||
{
|
{
|
||||||
rm_osd_cli.push_back(std::to_string(osd_num));
|
rm_osd_cli.push_back(std::to_string(osd_num));
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
#include "vitastor_kv.h"
|
#include "vitastor_kv.h"
|
||||||
|
|
||||||
|
#define KV_LIST_BUF_SIZE 65536
|
||||||
|
|
||||||
const char *exe_name = NULL;
|
const char *exe_name = NULL;
|
||||||
|
|
||||||
class kv_cli_t
|
class kv_cli_t
|
||||||
|
@ -290,10 +292,26 @@ void kv_cli_t::next_cmd()
|
||||||
struct kv_cli_list_t
|
struct kv_cli_list_t
|
||||||
{
|
{
|
||||||
vitastorkv_dbw_t *db = NULL;
|
vitastorkv_dbw_t *db = NULL;
|
||||||
|
std::string buf;
|
||||||
void *handle = NULL;
|
void *handle = NULL;
|
||||||
int format = 0;
|
int format = 0;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
std::function<void(int)> cb;
|
std::function<void(int)> cb;
|
||||||
|
|
||||||
|
void write(const std::string & str)
|
||||||
|
{
|
||||||
|
if (buf.capacity() < KV_LIST_BUF_SIZE)
|
||||||
|
buf.reserve(KV_LIST_BUF_SIZE);
|
||||||
|
if (buf.size() + str.size() > buf.capacity())
|
||||||
|
flush();
|
||||||
|
buf.append(str.data(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
void flush()
|
||||||
|
{
|
||||||
|
::write(1, buf.data(), buf.size());
|
||||||
|
buf.resize(0);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<std::string> kv_cli_t::parse_cmd(const std::string & str)
|
std::vector<std::string> kv_cli_t::parse_cmd(const std::string & str)
|
||||||
|
@ -604,11 +622,10 @@ void kv_cli_t::handle_cmd(const std::vector<std::string> & cmd, std::function<vo
|
||||||
if (res < 0)
|
if (res < 0)
|
||||||
{
|
{
|
||||||
if (res != -ENOENT)
|
if (res != -ENOENT)
|
||||||
{
|
|
||||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
||||||
}
|
|
||||||
if (lst->format == 2)
|
if (lst->format == 2)
|
||||||
printf("\n}\n");
|
lst->write("\n}\n");
|
||||||
|
lst->flush();
|
||||||
lst->db->list_close(lst->handle);
|
lst->db->list_close(lst->handle);
|
||||||
lst->cb(res == -ENOENT ? 0 : res);
|
lst->cb(res == -ENOENT ? 0 : res);
|
||||||
delete lst;
|
delete lst;
|
||||||
|
@ -616,11 +633,27 @@ void kv_cli_t::handle_cmd(const std::vector<std::string> & cmd, std::function<vo
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (lst->format == 2)
|
if (lst->format == 2)
|
||||||
printf(lst->n ? ",\n %s: %s" : "{\n %s: %s", addslashes(key).c_str(), addslashes(value).c_str());
|
{
|
||||||
|
lst->write(lst->n ? ",\n " : "{\n ");
|
||||||
|
lst->write(addslashes(key));
|
||||||
|
lst->write(": ");
|
||||||
|
lst->write(addslashes(value));
|
||||||
|
}
|
||||||
else if (lst->format == 1)
|
else if (lst->format == 1)
|
||||||
printf("set %s %s\n", auto_addslashes(key).c_str(), value.c_str());
|
{
|
||||||
|
lst->write("set ");
|
||||||
|
lst->write(auto_addslashes(key));
|
||||||
|
lst->write(" ");
|
||||||
|
lst->write(value);
|
||||||
|
lst->write("\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
printf("%s = %s\n", key.c_str(), value.c_str());
|
{
|
||||||
|
lst->write(key);
|
||||||
|
lst->write(" = ");
|
||||||
|
lst->write(value);
|
||||||
|
lst->write("\n");
|
||||||
|
}
|
||||||
lst->n++;
|
lst->n++;
|
||||||
lst->db->list_next(lst->handle, NULL);
|
lst->db->list_next(lst->handle, NULL);
|
||||||
}
|
}
|
||||||
|
|
|
@ -870,7 +870,7 @@ static void get_block(kv_db_t *db, uint64_t offset, int cur_level, int recheck_p
|
||||||
}
|
}
|
||||||
// Block already in cache, we can proceed
|
// Block already in cache, we can proceed
|
||||||
blk->usage = db->usage_counter;
|
blk->usage = db->usage_counter;
|
||||||
cb(0, BLK_UPDATING);
|
db->cli->msgr.ringloop->set_immediate([=] { cb(0, BLK_UPDATING); });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cluster_op_t *op = new cluster_op_t;
|
cluster_op_t *op = new cluster_op_t;
|
||||||
|
|
|
@ -22,8 +22,8 @@ int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
|
||||||
{
|
{
|
||||||
auto ttb = pst_it->second["total_raw_tb"].number_value();
|
auto ttb = pst_it->second["total_raw_tb"].number_value();
|
||||||
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
|
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
|
||||||
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
|
||||||
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
|
||||||
}
|
}
|
||||||
*reply = (FSSTAT3res){
|
*reply = (FSSTAT3res){
|
||||||
.status = NFS3_OK,
|
.status = NFS3_OK,
|
||||||
|
|
|
@ -210,6 +210,7 @@ resume_4:
|
||||||
st->res = res;
|
st->res = res;
|
||||||
kv_continue_create(st, 5);
|
kv_continue_create(st, 5);
|
||||||
});
|
});
|
||||||
|
return;
|
||||||
resume_5:
|
resume_5:
|
||||||
if (st->res < 0)
|
if (st->res < 0)
|
||||||
{
|
{
|
||||||
|
|
|
@ -13,6 +13,12 @@ void kv_read_inode(nfs_proxy_t *proxy, uint64_t ino,
|
||||||
std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
|
std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
|
||||||
bool allow_cache)
|
bool allow_cache)
|
||||||
{
|
{
|
||||||
|
if (!ino)
|
||||||
|
{
|
||||||
|
// Zero value can not exist
|
||||||
|
cb(-ENOENT, "", json11::Json());
|
||||||
|
return;
|
||||||
|
}
|
||||||
auto key = kv_inode_key(ino);
|
auto key = kv_inode_key(ino);
|
||||||
proxy->db->get(key, [=](int res, const std::string & value)
|
proxy->db->get(key, [=](int res, const std::string & value)
|
||||||
{
|
{
|
||||||
|
@ -49,7 +55,7 @@ int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
||||||
auto ino = kv_fh_inode(fh);
|
auto ino = kv_fh_inode(fh);
|
||||||
if (self->parent->trace)
|
if (self->parent->trace)
|
||||||
fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino);
|
fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino);
|
||||||
if (!kv_fh_valid(fh))
|
if (!kv_fh_valid(fh) || !ino)
|
||||||
{
|
{
|
||||||
*reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
|
*reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
|
||||||
rpc_queue_reply(rop);
|
rpc_queue_reply(rop);
|
||||||
|
|
|
@ -43,9 +43,30 @@ int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
||||||
uint64_t ino = direntry["ino"].uint64_value();
|
uint64_t ino = direntry["ino"].uint64_value();
|
||||||
kv_read_inode(self->parent, ino, [=](int res, const std::string & value, json11::Json ientry)
|
kv_read_inode(self->parent, ino, [=](int res, const std::string & value, json11::Json ientry)
|
||||||
{
|
{
|
||||||
if (res < 0)
|
if (res == -ENOENT)
|
||||||
{
|
{
|
||||||
*reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(res == -ENOENT ? -EIO : res) };
|
*reply = (LOOKUP3res){
|
||||||
|
.status = NFS3_OK,
|
||||||
|
.resok = (LOOKUP3resok){
|
||||||
|
.object = xdr_copy_string(rop->xdrs, kv_fh(ino)),
|
||||||
|
.obj_attributes = {
|
||||||
|
.attributes_follow = 1,
|
||||||
|
.attributes = (fattr3){
|
||||||
|
.type = (ftype3)0,
|
||||||
|
.mode = 0666,
|
||||||
|
.nlink = 1,
|
||||||
|
.fsid = self->parent->fsid,
|
||||||
|
.fileid = ino,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
rpc_queue_reply(rop);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (res < 0)
|
||||||
|
{
|
||||||
|
*reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(res) };
|
||||||
rpc_queue_reply(rop);
|
rpc_queue_reply(rop);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,6 +88,15 @@ resume_1:
|
||||||
return;
|
return;
|
||||||
resume_2:
|
resume_2:
|
||||||
if (st->res < 0)
|
if (st->res < 0)
|
||||||
|
{
|
||||||
|
if (st->res == -ENOENT)
|
||||||
|
{
|
||||||
|
// Just delete direntry and skip inode
|
||||||
|
fprintf(stderr, "direntry %s references a non-existing inode %ju, deleting\n",
|
||||||
|
kv_direntry_key(st->dir_ino, st->filename).c_str(), st->ino);
|
||||||
|
st->ino = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "error reading inode %s: %s (code %d)\n",
|
fprintf(stderr, "error reading inode %s: %s (code %d)\n",
|
||||||
kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res);
|
kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res);
|
||||||
|
@ -95,6 +104,8 @@ resume_2:
|
||||||
cb(st->res);
|
cb(st->res);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
std::string err;
|
std::string err;
|
||||||
st->ientry = json11::Json::parse(st->ientry_text, err);
|
st->ientry = json11::Json::parse(st->ientry_text, err);
|
||||||
|
|
|
@ -271,6 +271,12 @@ void osd_t::parse_config(bool init)
|
||||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||||
if (!inode_vanish_time)
|
if (!inode_vanish_time)
|
||||||
inode_vanish_time = 60;
|
inode_vanish_time = 60;
|
||||||
|
enable_pg_locks = config["enable_pg_locks"].is_null() || json_is_true(config["enable_pg_locks"]);
|
||||||
|
bool old_pg_locks_localize_only = pg_locks_localize_only;
|
||||||
|
pg_locks_localize_only = config["enable_pg_locks"].is_null();
|
||||||
|
pg_lock_retry_interval_ms = config["pg_lock_retry_interval"].uint64_value();
|
||||||
|
if (pg_lock_retry_interval_ms <= 1)
|
||||||
|
pg_lock_retry_interval_ms = 100;
|
||||||
auto old_auto_scrub = auto_scrub;
|
auto old_auto_scrub = auto_scrub;
|
||||||
auto_scrub = json_is_true(config["auto_scrub"]);
|
auto_scrub = json_is_true(config["auto_scrub"]);
|
||||||
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
|
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
|
||||||
|
@ -336,6 +342,10 @@ void osd_t::parse_config(bool init)
|
||||||
{
|
{
|
||||||
apply_recovery_tune_interval();
|
apply_recovery_tune_interval();
|
||||||
}
|
}
|
||||||
|
if (old_pg_locks_localize_only != pg_locks_localize_only)
|
||||||
|
{
|
||||||
|
apply_pg_locks_localize_only();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::bind_socket()
|
void osd_t::bind_socket()
|
||||||
|
@ -447,6 +457,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
if (readonly &&
|
if (readonly &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
|
||||||
|
cur_op->req.hdr.opcode != OSD_OP_SEC_LOCK &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||||
|
|
|
@ -92,6 +92,12 @@ struct recovery_stat_t
|
||||||
uint64_t count, usec, bytes;
|
uint64_t count, usec, bytes;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct osd_pg_lock_t
|
||||||
|
{
|
||||||
|
osd_num_t primary_osd = 0;
|
||||||
|
uint64_t state = 0;
|
||||||
|
};
|
||||||
|
|
||||||
class osd_t
|
class osd_t
|
||||||
{
|
{
|
||||||
// config
|
// config
|
||||||
|
@ -140,6 +146,9 @@ class osd_t
|
||||||
uint32_t scrub_list_limit = 1000;
|
uint32_t scrub_list_limit = 1000;
|
||||||
bool scrub_find_best = true;
|
bool scrub_find_best = true;
|
||||||
uint64_t scrub_ec_max_bruteforce = 100;
|
uint64_t scrub_ec_max_bruteforce = 100;
|
||||||
|
bool enable_pg_locks = false;
|
||||||
|
bool pg_locks_localize_only = false;
|
||||||
|
uint64_t pg_lock_retry_interval_ms = 100;
|
||||||
|
|
||||||
// cluster state
|
// cluster state
|
||||||
|
|
||||||
|
@ -159,6 +168,7 @@ class osd_t
|
||||||
|
|
||||||
// peers and PGs
|
// peers and PGs
|
||||||
|
|
||||||
|
std::map<pool_pg_num_t, osd_pg_lock_t> pg_locks;
|
||||||
std::map<pool_id_t, pg_num_t> pg_counts;
|
std::map<pool_id_t, pg_num_t> pg_counts;
|
||||||
std::map<pool_pg_num_t, pg_t> pgs;
|
std::map<pool_pg_num_t, pg_t> pgs;
|
||||||
std::set<pool_pg_num_t> dirty_pgs;
|
std::set<pool_pg_num_t> dirty_pgs;
|
||||||
|
@ -239,6 +249,8 @@ class osd_t
|
||||||
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||||
void on_load_config_hook(json11::Json::object & changes);
|
void on_load_config_hook(json11::Json::object & changes);
|
||||||
void on_reload_config_hook(json11::Json::object & changes);
|
void on_reload_config_hook(json11::Json::object & changes);
|
||||||
|
void on_change_pool_config_hook();
|
||||||
|
void apply_pg_locks_localize_only();
|
||||||
json11::Json on_load_pgs_checks_hook();
|
json11::Json on_load_pgs_checks_hook();
|
||||||
void on_load_pgs_hook(bool success);
|
void on_load_pgs_hook(bool success);
|
||||||
void bind_socket();
|
void bind_socket();
|
||||||
|
@ -266,13 +278,19 @@ class osd_t
|
||||||
void handle_peers();
|
void handle_peers();
|
||||||
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
||||||
void repeer_pgs(osd_num_t osd_num);
|
void repeer_pgs(osd_num_t osd_num);
|
||||||
|
void repeer_pg(pg_t & pg);
|
||||||
void start_pg_peering(pg_t & pg);
|
void start_pg_peering(pg_t & pg);
|
||||||
void drop_dirty_pg_connections(pool_pg_num_t pg);
|
void drop_dirty_pg_connections(pool_pg_num_t pg);
|
||||||
|
void record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state);
|
||||||
|
void relock_pg(pg_t & pg);
|
||||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||||
void discard_list_subop(osd_op_t *list_op);
|
void discard_list_subop(osd_op_t *list_op);
|
||||||
bool stop_pg(pg_t & pg);
|
bool stop_pg(pg_t & pg);
|
||||||
void reset_pg(pg_t & pg);
|
void reset_pg(pg_t & pg);
|
||||||
void finish_stop_pg(pg_t & pg);
|
void finish_stop_pg(pg_t & pg);
|
||||||
|
void rm_inflight(pg_t & pg);
|
||||||
|
void continue_pg(pg_t & pg);
|
||||||
|
bool continue_pg_peering(pg_t & pg);
|
||||||
|
|
||||||
// flushing, recovery and backfill
|
// flushing, recovery and backfill
|
||||||
void submit_pg_flush_ops(pg_t & pg);
|
void submit_pg_flush_ops(pg_t & pg);
|
||||||
|
@ -299,10 +317,13 @@ class osd_t
|
||||||
void finish_op(osd_op_t *cur_op, int retval);
|
void finish_op(osd_op_t *cur_op, int retval);
|
||||||
|
|
||||||
// secondary ops
|
// secondary ops
|
||||||
|
bool sec_check_pg_lock(osd_num_t primary_osd, const object_id &oid);
|
||||||
void exec_sync_stab_all(osd_op_t *cur_op);
|
void exec_sync_stab_all(osd_op_t *cur_op);
|
||||||
void exec_show_config(osd_op_t *cur_op);
|
void exec_show_config(osd_op_t *cur_op);
|
||||||
void exec_secondary(osd_op_t *cur_op);
|
void exec_secondary(osd_op_t *cur_op);
|
||||||
void exec_secondary_real(osd_op_t *cur_op);
|
void exec_secondary_real(osd_op_t *cur_op);
|
||||||
|
void exec_sec_read_bmp(osd_op_t *cur_op);
|
||||||
|
void exec_sec_lock(osd_op_t *cur_op);
|
||||||
void secondary_op_callback(osd_op_t *cur_op);
|
void secondary_op_callback(osd_op_t *cur_op);
|
||||||
|
|
||||||
// primary ops
|
// primary ops
|
||||||
|
@ -310,6 +331,7 @@ class osd_t
|
||||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||||
void continue_primary_read(osd_op_t *cur_op);
|
void continue_primary_read(osd_op_t *cur_op);
|
||||||
void continue_primary_scrub(osd_op_t *cur_op);
|
void continue_primary_scrub(osd_op_t *cur_op);
|
||||||
|
void continue_local_secondary_read(osd_op_t *cur_op);
|
||||||
void continue_primary_describe(osd_op_t *cur_op);
|
void continue_primary_describe(osd_op_t *cur_op);
|
||||||
void continue_primary_list(osd_op_t *cur_op);
|
void continue_primary_list(osd_op_t *cur_op);
|
||||||
void continue_primary_write(osd_op_t *cur_op);
|
void continue_primary_write(osd_op_t *cur_op);
|
||||||
|
@ -347,13 +369,13 @@ class osd_t
|
||||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||||
|
|
||||||
void continue_chained_read(osd_op_t *cur_op);
|
void continue_chained_read(osd_op_t *cur_op);
|
||||||
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
int submit_chained_read_requests(pg_t *pg, osd_op_t *cur_op);
|
||||||
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
||||||
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
void send_chained_read_results(pg_t *pg, osd_op_t *cur_op);
|
||||||
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
||||||
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
||||||
int submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg);
|
int submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg);
|
||||||
int read_bitmaps(osd_op_t *cur_op, pg_t & pg, int base_state);
|
int read_bitmaps(osd_op_t *cur_op, pg_t *pg, int base_state);
|
||||||
|
|
||||||
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
|
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
|
||||||
{
|
{
|
||||||
|
|
|
@ -65,6 +65,7 @@ void osd_t::init_cluster()
|
||||||
st_cli.tfd = tfd;
|
st_cli.tfd = tfd;
|
||||||
st_cli.log_level = log_level;
|
st_cli.log_level = log_level;
|
||||||
st_cli.on_change_osd_state_hook = [this](osd_num_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
st_cli.on_change_osd_state_hook = [this](osd_num_t peer_osd) { on_change_osd_state_hook(peer_osd); };
|
||||||
|
st_cli.on_change_pool_config_hook = [this]() { on_change_pool_config_hook(); };
|
||||||
st_cli.on_change_backfillfull_hook = [this](pool_id_t pool_id) { on_change_backfillfull_hook(pool_id); };
|
st_cli.on_change_backfillfull_hook = [this](pool_id_t pool_id) { on_change_backfillfull_hook(pool_id); };
|
||||||
st_cli.on_change_pg_history_hook = [this](pool_id_t pool_id, pg_num_t pg_num) { on_change_pg_history_hook(pool_id, pg_num); };
|
st_cli.on_change_pg_history_hook = [this](pool_id_t pool_id, pg_num_t pg_num) { on_change_pg_history_hook(pool_id, pg_num); };
|
||||||
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_etcd_state_hook(changes); };
|
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_etcd_state_hook(changes); };
|
||||||
|
@ -153,23 +154,19 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cl->enable_pg_locks = conf["features"]["pg_locks"].bool_value();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
json11::Json osd_t::get_osd_state()
|
json11::Json osd_t::get_osd_state()
|
||||||
{
|
{
|
||||||
std::vector<char> hostname;
|
|
||||||
hostname.resize(1024);
|
|
||||||
while (gethostname(hostname.data(), hostname.size()) < 0 && errno == ENAMETOOLONG)
|
|
||||||
hostname.resize(hostname.size()+1024);
|
|
||||||
hostname.resize(strnlen(hostname.data(), hostname.size()));
|
|
||||||
json11::Json::object st;
|
json11::Json::object st;
|
||||||
st["state"] = "up";
|
st["state"] = "up";
|
||||||
if (bind_addresses.size() != 1 || bind_addresses[0] != "0.0.0.0")
|
if (bind_addresses.size() != 1 || bind_addresses[0] != "0.0.0.0")
|
||||||
st["addresses"] = bind_addresses;
|
st["addresses"] = bind_addresses;
|
||||||
else
|
else
|
||||||
st["addresses"] = getifaddr_list();
|
st["addresses"] = getifaddr_list();
|
||||||
st["host"] = std::string(hostname.data(), hostname.size());
|
st["host"] = gethostname_str();
|
||||||
st["version"] = VITASTOR_VERSION;
|
st["version"] = VITASTOR_VERSION;
|
||||||
st["port"] = listening_port;
|
st["port"] = listening_port;
|
||||||
#ifdef WITH_RDMACM
|
#ifdef WITH_RDMACM
|
||||||
|
@ -419,6 +416,35 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void osd_t::on_change_pool_config_hook()
|
||||||
|
{
|
||||||
|
apply_pg_locks_localize_only();
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::apply_pg_locks_localize_only()
|
||||||
|
{
|
||||||
|
for (auto & pp: pgs)
|
||||||
|
{
|
||||||
|
auto pool_it = st_cli.pool_config.find(pp.first.pool_id);
|
||||||
|
if (pool_it == st_cli.pool_config.end())
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto & pool_cfg = pool_it->second;
|
||||||
|
auto & pg = pp.second;
|
||||||
|
auto old_disable_pg_locks = pg.disable_pg_locks;
|
||||||
|
pg.disable_pg_locks = pg_locks_localize_only &&
|
||||||
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
|
||||||
|
pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY;
|
||||||
|
if (!pg.disable_pg_locks && old_disable_pg_locks)
|
||||||
|
{
|
||||||
|
// Relock PG
|
||||||
|
printf("[PG %u/%u] Repeer to enable PG locks\n", pg.pool_id, pg.pg_num);
|
||||||
|
repeer_pg(pg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void osd_t::on_change_backfillfull_hook(pool_id_t pool_id)
|
void osd_t::on_change_backfillfull_hook(pool_id_t pool_id)
|
||||||
{
|
{
|
||||||
if (!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
if (!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||||
|
@ -696,20 +722,27 @@ void osd_t::apply_pg_count()
|
||||||
// The external tool must wait for all PGs to come down before changing PG count
|
// The external tool must wait for all PGs to come down before changing PG count
|
||||||
// If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
|
// If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
|
||||||
// So an OSD just dies if it detects PG count change while there are active PGs
|
// So an OSD just dies if it detects PG count change while there are active PGs
|
||||||
int still_active = 0;
|
int still_active_primary = 0;
|
||||||
for (auto & kv: pgs)
|
for (auto & kv: pgs)
|
||||||
{
|
{
|
||||||
if (kv.first.pool_id == pool_item.first && (kv.second.state & PG_ACTIVE))
|
if (kv.first.pool_id == pool_item.first && (kv.second.state & PG_ACTIVE))
|
||||||
{
|
{
|
||||||
still_active++;
|
still_active_primary++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (still_active > 0)
|
int still_active_secondary = 0;
|
||||||
|
for (auto lock_it = pg_locks.lower_bound((pool_pg_num_t){ .pool_id = pool_item.first, .pg_num = 0 });
|
||||||
|
lock_it != pg_locks.end() && lock_it->first.pool_id == pool_item.first; lock_it++)
|
||||||
|
{
|
||||||
|
still_active_secondary++;
|
||||||
|
}
|
||||||
|
if (still_active_primary > 0 || still_active_secondary > 0)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %ju] PG count change detected for pool %u (new is %ju, old is %u),"
|
"[OSD %ju] PG count change detected for pool %u (new is %ju, old is %u),"
|
||||||
" but %u PG(s) are still active. This is not allowed. Exiting\n",
|
" but %u PG(s) are still active as primary and %u as secondary. This is not allowed. Exiting\n",
|
||||||
this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
|
this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first],
|
||||||
|
still_active_primary, still_active_secondary
|
||||||
);
|
);
|
||||||
force_stop(1);
|
force_stop(1);
|
||||||
return;
|
return;
|
||||||
|
@ -836,22 +869,23 @@ void osd_t::apply_pg_config()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
auto & pg = this->pgs[{ .pool_id = pool_id, .pg_num = pg_num }];
|
auto & pg = this->pgs[{ .pool_id = pool_id, .pg_num = pg_num }];
|
||||||
pg = (pg_t){
|
pg.state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING;
|
||||||
.state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
|
pg.scheme = pool_item.second.scheme;
|
||||||
.scheme = pool_item.second.scheme,
|
pg.pg_cursize = 0;
|
||||||
.pg_cursize = 0,
|
pg.pg_size = pool_item.second.pg_size;
|
||||||
.pg_size = pool_item.second.pg_size,
|
pg.pg_minsize = pool_item.second.pg_minsize;
|
||||||
.pg_minsize = pool_item.second.pg_minsize,
|
pg.pg_data_size = pool_item.second.scheme == POOL_SCHEME_REPLICATED
|
||||||
.pg_data_size = pool_item.second.scheme == POOL_SCHEME_REPLICATED
|
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks;
|
||||||
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
|
pg.pool_id = pool_id;
|
||||||
.pool_id = pool_id,
|
pg.pg_num = pg_num;
|
||||||
.pg_num = pg_num,
|
pg.reported_epoch = pg_cfg.epoch;
|
||||||
.reported_epoch = pg_cfg.epoch,
|
pg.target_history = pg_cfg.target_history;
|
||||||
.target_history = pg_cfg.target_history,
|
pg.all_peers = vec_all_peers;
|
||||||
.all_peers = vec_all_peers,
|
pg.next_scrub = pg_cfg.next_scrub;
|
||||||
.next_scrub = pg_cfg.next_scrub,
|
pg.target_set = pg_cfg.target_set;
|
||||||
.target_set = pg_cfg.target_set,
|
pg.disable_pg_locks = pg_locks_localize_only &&
|
||||||
};
|
pool_item.second.scheme == POOL_SCHEME_REPLICATED &&
|
||||||
|
pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY;
|
||||||
if (pg.scheme == POOL_SCHEME_EC)
|
if (pg.scheme == POOL_SCHEME_EC)
|
||||||
{
|
{
|
||||||
use_ec(pg.pg_size, pg.pg_data_size, true);
|
use_ec(pg.pg_size, pg.pg_data_size, true);
|
||||||
|
|
|
@ -150,14 +150,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||||
{
|
{
|
||||||
continue_primary_write(op);
|
continue_primary_write(op);
|
||||||
}
|
}
|
||||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
continue_pg(pg);
|
||||||
{
|
|
||||||
finish_stop_pg(pg);
|
|
||||||
}
|
|
||||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
|
||||||
start_pg_peering(pg);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -209,7 +202,6 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||||
.sec_stab = {
|
.sec_stab = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
|
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
|
||||||
},
|
},
|
||||||
.len = count * sizeof(obj_ver_id),
|
.len = count * sizeof(obj_ver_id),
|
||||||
|
@ -255,7 +247,8 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||||
restart:
|
restart:
|
||||||
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||||
{
|
{
|
||||||
if ((pg_it->second.state & mask) == check)
|
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||||
|
if ((pg_it->second.state & mask) == check && src.size() > 0)
|
||||||
{
|
{
|
||||||
auto pool_it = st_cli.pool_config.find(pg_it->first.pool_id);
|
auto pool_it = st_cli.pool_config.find(pg_it->first.pool_id);
|
||||||
if (pool_it != st_cli.pool_config.end() && pool_it->second.backfillfull)
|
if (pool_it != st_cli.pool_config.end() && pool_it->second.backfillfull)
|
||||||
|
@ -264,8 +257,6 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||||
recovery_last_pg.pool_id++;
|
recovery_last_pg.pool_id++;
|
||||||
goto restart;
|
goto restart;
|
||||||
}
|
}
|
||||||
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
|
||||||
assert(src.size() > 0);
|
|
||||||
// Restart scanning from the next object
|
// Restart scanning from the next object
|
||||||
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
||||||
{
|
{
|
||||||
|
|
|
@ -21,28 +21,8 @@ void osd_t::handle_peers()
|
||||||
{
|
{
|
||||||
if (p.second.state == PG_PEERING)
|
if (p.second.state == PG_PEERING)
|
||||||
{
|
{
|
||||||
if (!p.second.peering_state->list_ops.size())
|
if (continue_pg_peering(p.second))
|
||||||
{
|
{
|
||||||
p.second.calc_object_states(log_level);
|
|
||||||
report_pg_state(p.second);
|
|
||||||
schedule_scrub(p.second);
|
|
||||||
incomplete_objects += p.second.incomplete_objects.size();
|
|
||||||
misplaced_objects += p.second.misplaced_objects.size();
|
|
||||||
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
|
|
||||||
degraded_objects += p.second.degraded_objects.size();
|
|
||||||
if (p.second.state & PG_HAS_UNCLEAN)
|
|
||||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
|
||||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
|
||||||
{
|
|
||||||
peering_state = peering_state | OSD_RECOVERING;
|
|
||||||
if (p.second.state & PG_HAS_DEGRADED)
|
|
||||||
{
|
|
||||||
// Restart recovery from degraded objects
|
|
||||||
recovery_last_degraded = true;
|
|
||||||
recovery_last_pg = {};
|
|
||||||
recovery_last_oid = {};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -95,6 +75,16 @@ void osd_t::handle_peers()
|
||||||
|
|
||||||
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||||
{
|
{
|
||||||
|
if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
|
||||||
|
{
|
||||||
|
for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
|
||||||
|
{
|
||||||
|
if (lock_it->second.primary_osd == peer_osd)
|
||||||
|
pg_locks.erase(lock_it++);
|
||||||
|
else
|
||||||
|
lock_it++;
|
||||||
|
}
|
||||||
|
}
|
||||||
// Re-peer affected PGs
|
// Re-peer affected PGs
|
||||||
for (auto & p: pgs)
|
for (auto & p: pgs)
|
||||||
{
|
{
|
||||||
|
@ -114,7 +104,15 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||||
{
|
{
|
||||||
// Repeer this pg
|
// Repeer this pg
|
||||||
printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
|
printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
|
||||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
|
repeer_pg(pg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::repeer_pg(pg_t & pg)
|
||||||
|
{
|
||||||
|
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
|
||||||
{
|
{
|
||||||
start_pg_peering(pg);
|
start_pg_peering(pg);
|
||||||
}
|
}
|
||||||
|
@ -125,9 +123,6 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||||
report_pg_state(pg);
|
report_pg_state(pg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset PG state (when peering or stopping)
|
// Reset PG state (when peering or stopping)
|
||||||
void osd_t::reset_pg(pg_t & pg)
|
void osd_t::reset_pg(pg_t & pg)
|
||||||
|
@ -195,7 +190,6 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
pg.state = PG_PEERING;
|
pg.state = PG_PEERING;
|
||||||
this->peering_state |= OSD_PEERING_PGS;
|
this->peering_state |= OSD_PEERING_PGS;
|
||||||
reset_pg(pg);
|
reset_pg(pg);
|
||||||
report_pg_state(pg);
|
|
||||||
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
// Try to connect with current peers if they're up, but we don't have connections to them
|
// Try to connect with current peers if they're up, but we don't have connections to them
|
||||||
// Otherwise we may erroneously decide that the pg is incomplete :-)
|
// Otherwise we may erroneously decide that the pg is incomplete :-)
|
||||||
|
@ -215,8 +209,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
{
|
{
|
||||||
// Wait until all OSDs are either connected or their /osd/state disappears from etcd
|
// Wait until all OSDs are either connected or their /osd/state disappears from etcd
|
||||||
pg.state = PG_INCOMPLETE;
|
pg.state = PG_INCOMPLETE;
|
||||||
report_pg_state(pg);
|
// Fall through to cleanup list results
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
// Calculate current write OSD set
|
// Calculate current write OSD set
|
||||||
pg.pg_cursize = 0;
|
pg.pg_cursize = 0;
|
||||||
|
@ -242,8 +235,6 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
// because such PGs don't flush unstable entries on secondary OSDs so they can't remove these
|
// because such PGs don't flush unstable entries on secondary OSDs so they can't remove these
|
||||||
// entries from their journals...
|
// entries from their journals...
|
||||||
pg.state = PG_INCOMPLETE;
|
pg.state = PG_INCOMPLETE;
|
||||||
report_pg_state(pg);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
std::set<osd_num_t> cur_peers;
|
std::set<osd_num_t> cur_peers;
|
||||||
std::set<osd_num_t> dead_peers;
|
std::set<osd_num_t> dead_peers;
|
||||||
|
@ -278,8 +269,6 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
if (nonzero >= pg.pg_data_size && found < pg.pg_data_size)
|
if (nonzero >= pg.pg_data_size && found < pg.pg_data_size)
|
||||||
{
|
{
|
||||||
pg.state = PG_INCOMPLETE;
|
pg.state = PG_INCOMPLETE;
|
||||||
report_pg_state(pg);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -318,6 +307,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
delete pg.peering_state;
|
delete pg.peering_state;
|
||||||
pg.peering_state = NULL;
|
pg.peering_state = NULL;
|
||||||
}
|
}
|
||||||
|
report_pg_state(pg);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!pg.peering_state)
|
if (!pg.peering_state)
|
||||||
|
@ -326,8 +316,22 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
pg.peering_state->pool_id = pg.pool_id;
|
pg.peering_state->pool_id = pg.pool_id;
|
||||||
pg.peering_state->pg_num = pg.pg_num;
|
pg.peering_state->pg_num = pg.pg_num;
|
||||||
}
|
}
|
||||||
for (osd_num_t peer_osd: cur_peers)
|
pg.peering_state->locked = false;
|
||||||
|
pg.peering_state->lists_done = false;
|
||||||
|
report_pg_state(pg);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_t::continue_pg_peering(pg_t & pg)
|
||||||
{
|
{
|
||||||
|
if (pg.peering_state->locked)
|
||||||
|
{
|
||||||
|
pg.peering_state->lists_done = true;
|
||||||
|
for (osd_num_t peer_osd: pg.cur_peers)
|
||||||
|
{
|
||||||
|
if (pg.peering_state->list_results.find(peer_osd) == pg.peering_state->list_results.end())
|
||||||
|
{
|
||||||
|
pg.peering_state->lists_done = false;
|
||||||
|
}
|
||||||
if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
|
if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
|
||||||
pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
|
pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
|
||||||
{
|
{
|
||||||
|
@ -335,7 +339,180 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||||
}
|
}
|
||||||
submit_list_subop(peer_osd, pg.peering_state);
|
submit_list_subop(peer_osd, pg.peering_state);
|
||||||
}
|
}
|
||||||
ringloop->wakeup();
|
}
|
||||||
|
if (pg.peering_state->lists_done)
|
||||||
|
{
|
||||||
|
pg.calc_object_states(log_level);
|
||||||
|
report_pg_state(pg);
|
||||||
|
schedule_scrub(pg);
|
||||||
|
incomplete_objects += pg.incomplete_objects.size();
|
||||||
|
misplaced_objects += pg.misplaced_objects.size();
|
||||||
|
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
|
||||||
|
degraded_objects += pg.degraded_objects.size();
|
||||||
|
if (pg.state & PG_HAS_UNCLEAN)
|
||||||
|
this->peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||||
|
else if (pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||||
|
{
|
||||||
|
this->peering_state = peering_state | OSD_RECOVERING;
|
||||||
|
if (pg.state & PG_HAS_DEGRADED)
|
||||||
|
{
|
||||||
|
// Restart recovery from degraded objects
|
||||||
|
this->recovery_last_degraded = true;
|
||||||
|
this->recovery_last_pg = {};
|
||||||
|
this->recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state)
|
||||||
|
{
|
||||||
|
if (!pg_state)
|
||||||
|
pg.lock_peers.erase(peer_osd);
|
||||||
|
else
|
||||||
|
pg.lock_peers[peer_osd] = pg_state;
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::relock_pg(pg_t & pg)
|
||||||
|
{
|
||||||
|
if (!enable_pg_locks || pg.disable_pg_locks && !pg.lock_peers.size())
|
||||||
|
{
|
||||||
|
if (pg.state & PG_PEERING)
|
||||||
|
pg.peering_state->locked = true;
|
||||||
|
continue_pg(pg);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (pg.inflight_locks > 0 || pg.lock_waiting)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Check that lock_peers are equal to cur_peers and correct the difference, if any
|
||||||
|
uint64_t wanted_state = pg.state;
|
||||||
|
std::vector<osd_num_t> diff_osds;
|
||||||
|
if (!(pg.state & (PG_STOPPING | PG_OFFLINE | PG_INCOMPLETE)) && !pg.disable_pg_locks)
|
||||||
|
{
|
||||||
|
for (osd_num_t peer_osd: pg.cur_peers)
|
||||||
|
{
|
||||||
|
if (peer_osd != this->osd_num)
|
||||||
|
{
|
||||||
|
auto lock_it = pg.lock_peers.find(peer_osd);
|
||||||
|
if (lock_it == pg.lock_peers.end())
|
||||||
|
diff_osds.push_back(peer_osd);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (lock_it->second != wanted_state)
|
||||||
|
diff_osds.push_back(peer_osd);
|
||||||
|
lock_it->second |= ((uint64_t)1 << 63);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int relock_osd_count = diff_osds.size();
|
||||||
|
for (auto & lp: pg.lock_peers)
|
||||||
|
{
|
||||||
|
if (!(lp.second & ((uint64_t)1 << 63)))
|
||||||
|
diff_osds.push_back(lp.first);
|
||||||
|
lp.second &= ~((uint64_t)1 << 63);
|
||||||
|
}
|
||||||
|
if (!diff_osds.size())
|
||||||
|
{
|
||||||
|
if (pg.state & PG_PEERING)
|
||||||
|
pg.peering_state->locked = true;
|
||||||
|
continue_pg(pg);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
pg.inflight_locks++;
|
||||||
|
for (int i = 0; i < diff_osds.size(); i++)
|
||||||
|
{
|
||||||
|
bool unlock_peer = (i >= relock_osd_count);
|
||||||
|
uint64_t new_state = unlock_peer ? 0 : pg.state;
|
||||||
|
auto peer_osd = diff_osds[i];
|
||||||
|
auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||||
|
if (peer_fd_it == msgr.osd_peer_fds.end())
|
||||||
|
{
|
||||||
|
if (unlock_peer)
|
||||||
|
{
|
||||||
|
// Peer is dead - unlocked automatically
|
||||||
|
record_pg_lock(pg, peer_osd, new_state);
|
||||||
|
diff_osds.erase(diff_osds.begin()+(i--));
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int peer_fd = peer_fd_it->second;
|
||||||
|
auto cl = msgr.clients.at(peer_fd);
|
||||||
|
if (!cl->enable_pg_locks)
|
||||||
|
{
|
||||||
|
// Peer does not support locking - just instantly remember the lock as successful
|
||||||
|
record_pg_lock(pg, peer_osd, new_state);
|
||||||
|
diff_osds.erase(diff_osds.begin()+(i--));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
pg.inflight_locks++;
|
||||||
|
osd_op_t *op = new osd_op_t();
|
||||||
|
op->op_type = OSD_OP_OUT;
|
||||||
|
op->peer_fd = peer_fd;
|
||||||
|
op->req = (osd_any_op_t){
|
||||||
|
.sec_lock = {
|
||||||
|
.header = {
|
||||||
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
|
.opcode = OSD_OP_SEC_LOCK,
|
||||||
|
},
|
||||||
|
.flags = (uint64_t)(unlock_peer ? OSD_SEC_UNLOCK_PG : OSD_SEC_LOCK_PG),
|
||||||
|
.pool_id = pg.pool_id,
|
||||||
|
.pg_num = pg.pg_num,
|
||||||
|
.pg_state = new_state,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
op->callback = [this, peer_osd](osd_op_t *op)
|
||||||
|
{
|
||||||
|
pool_pg_num_t pg_id = { .pool_id = (pool_id_t)op->req.sec_lock.pool_id, .pg_num = (pg_num_t)op->req.sec_lock.pg_num };
|
||||||
|
auto pg_it = pgs.find(pg_id);
|
||||||
|
if (pg_it == pgs.end())
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto & pg = pg_it->second;
|
||||||
|
if (op->reply.hdr.retval == 0)
|
||||||
|
{
|
||||||
|
record_pg_lock(pg_it->second, peer_osd, op->req.sec_lock.pg_state);
|
||||||
|
}
|
||||||
|
else if (op->reply.hdr.retval != -EPIPE)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
(op->reply.hdr.retval == -ENOENT
|
||||||
|
? "Failed to %1$s PG %2$u/%3$u on OSD %4$ju - peer didn't load PG info yet\n"
|
||||||
|
: (op->reply.sec_lock.cur_primary
|
||||||
|
? "Failed to %1$s PG %2$u/%3$u on OSD %4$ju - taken by OSD %6$ju (retval=%5$jd)\n"
|
||||||
|
: "Failed to %1$s PG %2$u/%3$u on OSD %4$ju - retval=%5$jd\n")),
|
||||||
|
op->req.sec_lock.flags == OSD_SEC_UNLOCK_PG ? "unlock" : "lock",
|
||||||
|
pg_id.pool_id, pg_id.pg_num, peer_osd, op->reply.hdr.retval, op->reply.sec_lock.cur_primary
|
||||||
|
);
|
||||||
|
// Retry relocking/unlocking PG after a short time
|
||||||
|
pg.lock_waiting = true;
|
||||||
|
tfd->set_timer(pg_lock_retry_interval_ms, false, [this, pg_id](int)
|
||||||
|
{
|
||||||
|
auto pg_it = pgs.find(pg_id);
|
||||||
|
if (pg_it != pgs.end())
|
||||||
|
{
|
||||||
|
pg_it->second.lock_waiting = false;
|
||||||
|
relock_pg(pg_it->second);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
pg.inflight_locks--;
|
||||||
|
relock_pg(pg);
|
||||||
|
delete op;
|
||||||
|
};
|
||||||
|
msgr.outbox_push(op);
|
||||||
|
}
|
||||||
|
if (pg.state & PG_PEERING)
|
||||||
|
{
|
||||||
|
pg.peering_state->locked = !diff_osds.size();
|
||||||
|
}
|
||||||
|
pg.inflight_locks--;
|
||||||
|
continue_pg(pg);
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
|
@ -383,15 +560,20 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
auto role_fd_it = msgr.osd_peer_fds.find(role_osd);
|
||||||
|
if (role_fd_it == msgr.osd_peer_fds.end())
|
||||||
|
{
|
||||||
|
printf("Failed to get object list from OSD %ju because it is disconnected\n", role_osd);
|
||||||
|
return;
|
||||||
|
}
|
||||||
// Peer
|
// Peer
|
||||||
osd_op_t *op = new osd_op_t();
|
osd_op_t *op = new osd_op_t();
|
||||||
op->op_type = OSD_OP_OUT;
|
op->op_type = OSD_OP_OUT;
|
||||||
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
|
op->peer_fd = role_fd_it->second;
|
||||||
op->req = (osd_any_op_t){
|
op->req = (osd_any_op_t){
|
||||||
.sec_list = {
|
.sec_list = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_LIST,
|
.opcode = OSD_OP_SEC_LIST,
|
||||||
},
|
},
|
||||||
.list_pg = ps->pg_num,
|
.list_pg = ps->pg_num,
|
||||||
|
@ -479,13 +661,8 @@ bool osd_t::stop_pg(pg_t & pg)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
|
pg.state = pg.state & ~PG_STARTING & ~PG_PEERING & ~PG_INCOMPLETE & ~PG_ACTIVE & ~PG_REPEERING & ~PG_OFFLINE | PG_STOPPING;
|
||||||
{
|
if (pg.can_stop())
|
||||||
finish_stop_pg(pg);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
|
|
||||||
if (pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
{
|
||||||
finish_stop_pg(pg);
|
finish_stop_pg(pg);
|
||||||
}
|
}
|
||||||
|
@ -566,9 +743,33 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||||
pg_cfg.target_history = pg.target_history;
|
pg_cfg.target_history = pg.target_history;
|
||||||
pg_cfg.all_peers = pg.all_peers;
|
pg_cfg.all_peers = pg.all_peers;
|
||||||
}
|
}
|
||||||
|
relock_pg(pg);
|
||||||
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
||||||
{
|
{
|
||||||
apply_pg_config();
|
apply_pg_config();
|
||||||
}
|
}
|
||||||
report_pg_states();
|
report_pg_states();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void osd_t::rm_inflight(pg_t & pg)
|
||||||
|
{
|
||||||
|
pg.inflight--;
|
||||||
|
assert(pg.inflight >= 0);
|
||||||
|
continue_pg(pg);
|
||||||
|
}
|
||||||
|
|
||||||
|
void osd_t::continue_pg(pg_t & pg)
|
||||||
|
{
|
||||||
|
if ((pg.state & PG_STOPPING) && pg.can_stop())
|
||||||
|
{
|
||||||
|
finish_stop_pg(pg);
|
||||||
|
}
|
||||||
|
else if ((pg.state & PG_REPEERING) && pg.can_repeer())
|
||||||
|
{
|
||||||
|
start_pg_peering(pg);
|
||||||
|
}
|
||||||
|
else if ((pg.state & PG_PEERING) && pg.peering_state->locked)
|
||||||
|
{
|
||||||
|
continue_pg_peering(pg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -489,3 +489,13 @@ void pg_t::print_state()
|
||||||
total_count
|
total_count
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool pg_t::can_stop()
|
||||||
|
{
|
||||||
|
return inflight == 0 && inflight_locks == 0 && !lock_peers.size() && !flush_batch;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool pg_t::can_repeer()
|
||||||
|
{
|
||||||
|
return inflight == 0 && !flush_batch;
|
||||||
|
}
|
||||||
|
|
|
@ -49,6 +49,8 @@ struct pg_peering_state_t
|
||||||
std::map<osd_num_t, pg_list_result_t> list_results;
|
std::map<osd_num_t, pg_list_result_t> list_results;
|
||||||
pool_id_t pool_id = 0;
|
pool_id_t pool_id = 0;
|
||||||
pg_num_t pg_num = 0;
|
pg_num_t pg_num = 0;
|
||||||
|
bool locked = false;
|
||||||
|
bool lists_done = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct obj_piece_id_t
|
struct obj_piece_id_t
|
||||||
|
@ -87,6 +89,7 @@ struct pg_t
|
||||||
pool_id_t pool_id = 0;
|
pool_id_t pool_id = 0;
|
||||||
pg_num_t pg_num = 0;
|
pg_num_t pg_num = 0;
|
||||||
uint64_t clean_count = 0, total_count = 0;
|
uint64_t clean_count = 0, total_count = 0;
|
||||||
|
bool disable_pg_locks = false;
|
||||||
// epoch number - should increase with each non-clean activation of the PG
|
// epoch number - should increase with each non-clean activation of the PG
|
||||||
uint64_t epoch = 0, reported_epoch = 0;
|
uint64_t epoch = 0, reported_epoch = 0;
|
||||||
// target history and all potential peers
|
// target history and all potential peers
|
||||||
|
@ -104,6 +107,10 @@ struct pg_t
|
||||||
// cur_set is the current set of connected peer OSDs for this PG
|
// cur_set is the current set of connected peer OSDs for this PG
|
||||||
// cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
|
// cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
|
||||||
std::vector<osd_num_t> cur_set;
|
std::vector<osd_num_t> cur_set;
|
||||||
|
// locked peer list => pg state reported to the peer
|
||||||
|
std::map<osd_num_t, uint64_t> lock_peers;
|
||||||
|
int inflight_locks = 0;
|
||||||
|
bool lock_waiting = false;
|
||||||
// same thing in state_dict-like format
|
// same thing in state_dict-like format
|
||||||
pg_osd_set_t cur_loc_set;
|
pg_osd_set_t cur_loc_set;
|
||||||
// moved object map. by default, each object is considered to reside on cur_set.
|
// moved object map. by default, each object is considered to reside on cur_set.
|
||||||
|
@ -125,6 +132,9 @@ struct pg_t
|
||||||
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
|
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
|
||||||
void calc_object_states(int log_level);
|
void calc_object_states(int log_level);
|
||||||
void print_state();
|
void print_state();
|
||||||
|
bool can_stop();
|
||||||
|
bool can_repeer();
|
||||||
|
void rm_inflight();
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||||
|
|
|
@ -37,7 +37,20 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
};
|
};
|
||||||
pg_num_t pg_num = (oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
|
pg_num_t pg_num = (oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
|
||||||
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
|
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
|
||||||
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
|
if (pg_it == pgs.end() || pg_it->second.state == PG_OFFLINE)
|
||||||
|
{
|
||||||
|
// Check for a local replicated read from secondary OSD
|
||||||
|
auto lock_it = cur_op->req.hdr.opcode == OSD_OP_READ && pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||||
|
? pg_locks.find({ .pool_id = pool_id, .pg_num = pg_num })
|
||||||
|
: pg_locks.end();
|
||||||
|
if (lock_it == pg_locks.end() || lock_it->second.state != PG_ACTIVE && lock_it->second.state != (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
||||||
|
{
|
||||||
|
// FIXME: Change EPIPE to something else
|
||||||
|
finish_op(cur_op, -EPIPE);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!(pg_it->second.state & PG_ACTIVE))
|
||||||
{
|
{
|
||||||
// This OSD is not primary for this PG or the PG is inactive
|
// This OSD is not primary for this PG or the PG is inactive
|
||||||
// FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
|
// FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
|
||||||
|
@ -69,7 +82,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
// Find parents from the same pool. Optimized reads only work within pools
|
// Find parents from the same pool. Optimized reads only work within pools
|
||||||
while (inode_it != st_cli.inode_config.end() &&
|
while (inode_it != st_cli.inode_config.end() &&
|
||||||
inode_it->second.parent_id &&
|
inode_it->second.parent_id &&
|
||||||
INODE_POOL(inode_it->second.parent_id) == pg_it->second.pool_id)
|
INODE_POOL(inode_it->second.parent_id) == pool_cfg.id)
|
||||||
{
|
{
|
||||||
// Check for loops - FIXME check it in etcd_state_client
|
// Check for loops - FIXME check it in etcd_state_client
|
||||||
if (inode_it->second.parent_id == cur_op->req.rw.inode ||
|
if (inode_it->second.parent_id == cur_op->req.rw.inode ||
|
||||||
|
@ -109,7 +122,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
);
|
);
|
||||||
void *data_buf = (uint8_t*)op_data + sizeof(osd_primary_op_data_t);
|
void *data_buf = (uint8_t*)op_data + sizeof(osd_primary_op_data_t);
|
||||||
op_data->pg_num = pg_num;
|
op_data->pg_num = pg_num;
|
||||||
op_data->pg = &pg_it->second;
|
op_data->pg = pg_it == pgs.end() ? NULL : &pg_it->second;
|
||||||
op_data->oid = oid;
|
op_data->oid = oid;
|
||||||
op_data->stripes = (osd_rmw_stripe_t*)data_buf;
|
op_data->stripes = (osd_rmw_stripe_t*)data_buf;
|
||||||
op_data->stripe_count = stripe_count;
|
op_data->stripe_count = stripe_count;
|
||||||
|
@ -144,7 +157,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
chain_num++;
|
chain_num++;
|
||||||
auto inode_it = st_cli.inode_config.find(cur_op->req.rw.inode);
|
auto inode_it = st_cli.inode_config.find(cur_op->req.rw.inode);
|
||||||
while (inode_it != st_cli.inode_config.end() && inode_it->second.parent_id &&
|
while (inode_it != st_cli.inode_config.end() && inode_it->second.parent_id &&
|
||||||
INODE_POOL(inode_it->second.parent_id) == pg_it->second.pool_id &&
|
INODE_POOL(inode_it->second.parent_id) == pool_cfg.id &&
|
||||||
// Check for loops
|
// Check for loops
|
||||||
inode_it->second.parent_id != cur_op->req.rw.inode)
|
inode_it->second.parent_id != cur_op->req.rw.inode)
|
||||||
{
|
{
|
||||||
|
@ -154,7 +167,10 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
chain_num++;
|
chain_num++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg_it->second.inflight++;
|
if (op_data->pg)
|
||||||
|
{
|
||||||
|
op_data->pg->inflight++;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -194,6 +210,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
|
pg_t *pg = op_data->pg;
|
||||||
if (op_data->chain_size)
|
if (op_data->chain_size)
|
||||||
{
|
{
|
||||||
continue_chained_read(cur_op);
|
continue_chained_read(cur_op);
|
||||||
|
@ -206,11 +223,10 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||||
resume_0:
|
resume_0:
|
||||||
cur_op->reply.rw.bitmap_len = 0;
|
cur_op->reply.rw.bitmap_len = 0;
|
||||||
{
|
{
|
||||||
auto & pg = *op_data->pg;
|
|
||||||
if (cur_op->req.rw.len == 0)
|
if (cur_op->req.rw.len == 0)
|
||||||
{
|
{
|
||||||
// len=0 => bitmap read
|
// len=0 => bitmap read
|
||||||
for (int role = 0; role < pg.pg_data_size; role++)
|
for (int role = 0; role < (pg ? pg->pg_data_size : 1); role++)
|
||||||
{
|
{
|
||||||
op_data->stripes[role].read_start = 0;
|
op_data->stripes[role].read_start = 0;
|
||||||
op_data->stripes[role].read_end = UINT32_MAX;
|
op_data->stripes[role].read_end = UINT32_MAX;
|
||||||
|
@ -218,40 +234,48 @@ resume_0:
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (int role = 0; role < pg.pg_data_size; role++)
|
for (int role = 0; role < (pg ? pg->pg_data_size : 1); role++)
|
||||||
{
|
{
|
||||||
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
|
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
|
||||||
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
|
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Determine version
|
// Determine version
|
||||||
auto vo_it = pg.ver_override.find(op_data->oid);
|
if (pg)
|
||||||
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
{
|
||||||
|
auto vo_it = pg->ver_override.find(op_data->oid);
|
||||||
|
op_data->target_ver = vo_it != pg->ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
// PG may have degraded or misplaced objects
|
// PG may have degraded or misplaced objects
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
op_data->prev_set = get_object_osd_set(*pg, op_data->oid, &op_data->object_state);
|
||||||
if (pg.state == PG_ACTIVE || pg.scheme == POOL_SCHEME_REPLICATED)
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
op_data->target_ver = UINT64_MAX;
|
||||||
|
op_data->prev_set = &this->osd_num;
|
||||||
|
}
|
||||||
|
if (!pg || pg->state == PG_ACTIVE || pg->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
// Fast happy-path
|
// Fast happy-path
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED &&
|
if (pg && pg->scheme == POOL_SCHEME_REPLICATED &&
|
||||||
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||||
{
|
{
|
||||||
finish_op(cur_op, -EIO);
|
finish_op(cur_op, -EIO);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_data_size, 0);
|
cur_op->buf = alloc_read_buffer(op_data->stripes, pg ? pg->pg_data_size : 1, 0);
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||||
op_data->st = 1;
|
op_data->st = 1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (extend_missing_stripes(op_data->stripes, op_data->prev_set, pg.pg_data_size, pg.pg_size) < 0)
|
if (extend_missing_stripes(op_data->stripes, op_data->prev_set, pg->pg_data_size, pg->pg_size) < 0)
|
||||||
{
|
{
|
||||||
finish_op(cur_op, -EIO);
|
finish_op(cur_op, -EIO);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Submit reads
|
// Submit reads
|
||||||
op_data->degraded = 1;
|
op_data->degraded = 1;
|
||||||
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
|
cur_op->buf = alloc_read_buffer(op_data->stripes, pg->pg_size, 0);
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||||
op_data->st = 1;
|
op_data->st = 1;
|
||||||
}
|
}
|
||||||
|
@ -261,32 +285,32 @@ resume_1:
|
||||||
resume_2:
|
resume_2:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
if (pg && (op_data->errcode == -EIO || op_data->errcode == -EDOM))
|
||||||
{
|
{
|
||||||
// I/O or checksum error
|
// I/O or checksum error
|
||||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||||
op_data->object_state = mark_object_corrupted(*op_data->pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
op_data->object_state = mark_object_corrupted(*pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
finish_op(cur_op, op_data->errcode);
|
finish_op(cur_op, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cur_op->reply.rw.version = op_data->fact_ver;
|
cur_op->reply.rw.version = op_data->fact_ver;
|
||||||
cur_op->reply.rw.bitmap_len = op_data->pg->pg_data_size * clean_entry_bitmap_size;
|
cur_op->reply.rw.bitmap_len = (pg ? pg->pg_data_size : 1) * clean_entry_bitmap_size;
|
||||||
if (op_data->degraded)
|
if (op_data->degraded)
|
||||||
{
|
{
|
||||||
// Reconstruct missing stripes
|
// Reconstruct missing stripes
|
||||||
osd_rmw_stripe_t *stripes = op_data->stripes;
|
osd_rmw_stripe_t *stripes = op_data->stripes;
|
||||||
if (op_data->pg->scheme == POOL_SCHEME_XOR)
|
if (pg->scheme == POOL_SCHEME_XOR)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_xor(stripes, op_data->pg->pg_size, clean_entry_bitmap_size);
|
reconstruct_stripes_xor(stripes, pg->pg_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
else if (op_data->pg->scheme == POOL_SCHEME_EC)
|
else if (pg->scheme == POOL_SCHEME_EC)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_ec(stripes, op_data->pg->pg_size, op_data->pg->pg_data_size, clean_entry_bitmap_size);
|
reconstruct_stripes_ec(stripes, pg->pg_size, pg->pg_data_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||||
for (int role = 0; role < op_data->pg->pg_size; role++)
|
for (int role = 0; role < pg->pg_size; role++)
|
||||||
{
|
{
|
||||||
if (stripes[role].req_end != 0)
|
if (stripes[role].req_end != 0)
|
||||||
{
|
{
|
||||||
|
@ -632,7 +656,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
|
||||||
{
|
{
|
||||||
this->misplaced_objects--;
|
this->misplaced_objects--;
|
||||||
pg.misplaced_objects.erase(oid);
|
pg.misplaced_objects.erase(oid);
|
||||||
if (!pg.misplaced_objects.size())
|
if (!pg.misplaced_objects.size() && !pg.copies_to_delete_after_sync.size())
|
||||||
{
|
{
|
||||||
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||||
changed = true;
|
changed = true;
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
void osd_t::continue_chained_read(osd_op_t *cur_op)
|
void osd_t::continue_chained_read(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
auto & pg = *op_data->pg;
|
auto pg = op_data->pg;
|
||||||
if (op_data->st == 1)
|
if (op_data->st == 1)
|
||||||
goto resume_1;
|
goto resume_1;
|
||||||
else if (op_data->st == 2)
|
else if (op_data->st == 2)
|
||||||
|
@ -17,7 +17,7 @@ void osd_t::continue_chained_read(osd_op_t *cur_op)
|
||||||
else if (op_data->st == 4)
|
else if (op_data->st == 4)
|
||||||
goto resume_4;
|
goto resume_4;
|
||||||
cur_op->reply.rw.bitmap_len = 0;
|
cur_op->reply.rw.bitmap_len = 0;
|
||||||
for (int role = 0; role < pg.pg_data_size; role++)
|
for (int role = 0; role < (pg ? pg->pg_data_size : 1); role++)
|
||||||
{
|
{
|
||||||
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
|
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
|
||||||
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
|
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
|
||||||
|
@ -40,10 +40,10 @@ resume_3:
|
||||||
resume_4:
|
resume_4:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
if (pg && (op_data->errcode == -EIO || op_data->errcode == -EDOM))
|
||||||
{
|
{
|
||||||
// Handle corrupted reads and retry...
|
// Handle corrupted reads and retry...
|
||||||
check_corrupted_chained(pg, cur_op);
|
check_corrupted_chained(*pg, cur_op);
|
||||||
free(cur_op->buf);
|
free(cur_op->buf);
|
||||||
cur_op->buf = NULL;
|
cur_op->buf = NULL;
|
||||||
free(op_data->chain_reads);
|
free(op_data->chain_reads);
|
||||||
|
@ -63,31 +63,30 @@ resume_4:
|
||||||
finish_op(cur_op, cur_op->req.rw.len);
|
finish_op(cur_op, cur_op->req.rw.len);
|
||||||
}
|
}
|
||||||
|
|
||||||
int osd_t::read_bitmaps(osd_op_t *cur_op, pg_t & pg, int base_state)
|
int osd_t::read_bitmaps(osd_op_t *cur_op, pg_t *pg, int base_state)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
if (op_data->st == base_state)
|
if (op_data->st == base_state)
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
else if (op_data->st == base_state+1)
|
else if (op_data->st == base_state+1)
|
||||||
goto resume_1;
|
goto resume_1;
|
||||||
if (pg.state == PG_ACTIVE && pg.scheme == POOL_SCHEME_REPLICATED)
|
if (!pg || pg->state == PG_ACTIVE && pg->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
// Happy path for clean replicated PGs (all bitmaps are available locally)
|
// Happy path for clean replicated PGs (all bitmaps are available locally)
|
||||||
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
||||||
{
|
{
|
||||||
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
||||||
auto vo_it = pg.ver_override.find(cur_oid);
|
|
||||||
auto read_version = (vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX);
|
|
||||||
// Read bitmap synchronously from the local database
|
// Read bitmap synchronously from the local database
|
||||||
bs->read_bitmap(
|
bs->read_bitmap(
|
||||||
cur_oid, read_version, (uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size,
|
cur_oid, UINT64_MAX, (uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size,
|
||||||
!chain_num ? &cur_op->reply.rw.version : NULL
|
!chain_num ? &cur_op->reply.rw.version : NULL
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (submit_bitmap_subops(cur_op, pg) < 0)
|
if (submit_bitmap_subops(cur_op, *pg) < 0)
|
||||||
{
|
{
|
||||||
// Failure
|
// Failure
|
||||||
finish_op(cur_op, -EIO);
|
finish_op(cur_op, -EIO);
|
||||||
|
@ -101,32 +100,32 @@ resume_0:
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
resume_1:
|
resume_1:
|
||||||
if (pg.scheme != POOL_SCHEME_REPLICATED)
|
if (pg->scheme != POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
||||||
{
|
{
|
||||||
// Check if we need to reconstruct any bitmaps
|
// Check if we need to reconstruct any bitmaps
|
||||||
for (int i = 0; i < pg.pg_size; i++)
|
for (int i = 0; i < pg->pg_size; i++)
|
||||||
{
|
{
|
||||||
if (op_data->missing_flags[chain_num*pg.pg_size + i])
|
if (op_data->missing_flags[chain_num*pg->pg_size + i])
|
||||||
{
|
{
|
||||||
osd_rmw_stripe_t local_stripes[pg.pg_size];
|
osd_rmw_stripe_t local_stripes[pg->pg_size];
|
||||||
for (i = 0; i < pg.pg_size; i++)
|
for (i = 0; i < pg->pg_size; i++)
|
||||||
{
|
{
|
||||||
local_stripes[i] = (osd_rmw_stripe_t){
|
local_stripes[i] = (osd_rmw_stripe_t){
|
||||||
.bmp_buf = (uint8_t*)op_data->snapshot_bitmaps + (chain_num*pg.pg_size + i)*clean_entry_bitmap_size,
|
.bmp_buf = (uint8_t*)op_data->snapshot_bitmaps + (chain_num*pg->pg_size + i)*clean_entry_bitmap_size,
|
||||||
.read_start = 1,
|
.read_start = 1,
|
||||||
.read_end = 1,
|
.read_end = 1,
|
||||||
.missing = op_data->missing_flags[chain_num*pg.pg_size + i] && true,
|
.missing = op_data->missing_flags[chain_num*pg->pg_size + i] && true,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (pg.scheme == POOL_SCHEME_XOR)
|
if (pg->scheme == POOL_SCHEME_XOR)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_xor(local_stripes, pg.pg_size, clean_entry_bitmap_size);
|
reconstruct_stripes_xor(local_stripes, pg->pg_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
else if (pg.scheme == POOL_SCHEME_EC)
|
else if (pg->scheme == POOL_SCHEME_EC)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_ec(local_stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
|
reconstruct_stripes_ec(local_stripes, pg->pg_size, pg->pg_data_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -139,6 +138,7 @@ resume_1:
|
||||||
|
|
||||||
int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests)
|
int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests)
|
||||||
{
|
{
|
||||||
|
assert(&pg);
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
for (int chain_num = 0; chain_num < op_data->chain_size; chain_num++)
|
||||||
{
|
{
|
||||||
|
@ -216,6 +216,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
|
||||||
|
|
||||||
int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||||
{
|
{
|
||||||
|
assert(&pg);
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
std::vector<bitmap_request_t> *bitmap_requests = new std::vector<bitmap_request_t>();
|
std::vector<bitmap_request_t> *bitmap_requests = new std::vector<bitmap_request_t>();
|
||||||
if (collect_bitmap_requests(cur_op, pg, *bitmap_requests) < 0)
|
if (collect_bitmap_requests(cur_op, pg, *bitmap_requests) < 0)
|
||||||
|
@ -266,7 +267,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||||
.sec_read_bmp = {
|
.sec_read_bmp = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_READ_BMP,
|
.opcode = OSD_OP_SEC_READ_BMP,
|
||||||
},
|
},
|
||||||
.len = sizeof(obj_ver_id)*(i+1-prev),
|
.len = sizeof(obj_ver_id)*(i+1-prev),
|
||||||
|
@ -383,12 +383,12 @@ std::vector<osd_chain_read_t> osd_t::collect_chained_read_requests(osd_op_t *cur
|
||||||
return chain_reads;
|
return chain_reads;
|
||||||
}
|
}
|
||||||
|
|
||||||
int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
int osd_t::submit_chained_read_requests(pg_t *pg, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
// Decide which parts of which objects we need to read based on bitmaps
|
// Decide which parts of which objects we need to read based on bitmaps
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
auto chain_reads = collect_chained_read_requests(cur_op);
|
auto chain_reads = collect_chained_read_requests(cur_op);
|
||||||
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
|
int stripe_count = (!pg || pg->scheme == POOL_SCHEME_REPLICATED ? 1 : pg->pg_size);
|
||||||
op_data->chain_read_count = chain_reads.size();
|
op_data->chain_read_count = chain_reads.size();
|
||||||
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
||||||
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
||||||
|
@ -409,23 +409,23 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||||
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
||||||
// FIXME: maybe introduce split_read_stripes to shorten these lines and to remove read_start=req_start
|
// FIXME: maybe introduce split_read_stripes to shorten these lines and to remove read_start=req_start
|
||||||
osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
|
osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
|
||||||
split_stripes(pg.pg_data_size, bs_block_size, chain_reads[cri].offset, chain_reads[cri].len, stripes);
|
split_stripes(pg ? pg->pg_data_size : 1, bs_block_size, chain_reads[cri].offset, chain_reads[cri].len, stripes);
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
|
if ((!pg || pg->scheme == POOL_SCHEME_REPLICATED) && !stripes[0].req_end)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (int role = 0; role < pg.pg_data_size; role++)
|
for (int role = 0; role < (pg ? pg->pg_data_size : 1); role++)
|
||||||
{
|
{
|
||||||
stripes[role].read_start = stripes[role].req_start;
|
stripes[role].read_start = stripes[role].req_start;
|
||||||
stripes[role].read_end = stripes[role].req_end;
|
stripes[role].read_end = stripes[role].req_end;
|
||||||
}
|
}
|
||||||
uint64_t *cur_set = pg.cur_set.data();
|
uint64_t *cur_set = pg ? pg->cur_set.data() : &this->osd_num;
|
||||||
if (pg.state != PG_ACTIVE)
|
if (pg && pg->state != PG_ACTIVE)
|
||||||
{
|
{
|
||||||
cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
|
cur_set = get_object_osd_set(*pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
|
||||||
if (pg.scheme != POOL_SCHEME_REPLICATED)
|
if (pg->scheme != POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
if (extend_missing_stripes(stripes, cur_set, pg->pg_data_size, pg->pg_size) < 0)
|
||||||
{
|
{
|
||||||
free(op_data->chain_reads);
|
free(op_data->chain_reads);
|
||||||
op_data->chain_reads = NULL;
|
op_data->chain_reads = NULL;
|
||||||
|
@ -446,14 +446,14 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
if (!pg || pg->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
n_subops++;
|
n_subops++;
|
||||||
read_buffer_size += stripes[0].read_end - stripes[0].read_start;
|
read_buffer_size += stripes[0].read_end - stripes[0].read_start;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (int role = 0; role < pg.pg_size; role++)
|
for (int role = 0; role < pg->pg_size; role++)
|
||||||
{
|
{
|
||||||
if (stripes[role].read_end > 0 && cur_set[role] != 0)
|
if (stripes[role].read_end > 0 && cur_set[role] != 0)
|
||||||
n_subops++;
|
n_subops++;
|
||||||
|
@ -491,19 +491,23 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||||
for (int cri = 0; cri < chain_reads.size(); cri++)
|
for (int cri = 0; cri < chain_reads.size(); cri++)
|
||||||
{
|
{
|
||||||
osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
|
osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
|
if ((!pg || pg->scheme == POOL_SCHEME_REPLICATED) && !stripes[0].req_end)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
||||||
auto vo_it = pg.ver_override.find(cur_oid);
|
uint64_t target_ver = UINT64_MAX;
|
||||||
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
if (pg)
|
||||||
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
|
||||||
uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
|
|
||||||
int zero_read = -1;
|
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
|
||||||
{
|
{
|
||||||
for (int role = 0; role < pg.pg_size; role++)
|
auto vo_it = pg->ver_override.find(cur_oid);
|
||||||
|
target_ver = vo_it != pg->ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
|
}
|
||||||
|
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||||
|
uint64_t *cur_set = (!pg ? &this->osd_num : (pg->state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg->cur_set.data()));
|
||||||
|
int zero_read = -1;
|
||||||
|
if (!pg || pg->scheme == POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
for (int role = 0; role < (pg ? pg->pg_size : 1); role++)
|
||||||
if (cur_set[role] == this->osd_num || zero_read == -1)
|
if (cur_set[role] == this->osd_num || zero_read == -1)
|
||||||
zero_read = role;
|
zero_read = role;
|
||||||
}
|
}
|
||||||
|
@ -515,6 +519,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||||
|
|
||||||
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
|
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
|
assert(&pg);
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
|
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
|
||||||
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
||||||
|
@ -540,33 +545,32 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
|
void osd_t::send_chained_read_results(pg_t *pg, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
|
int stripe_count = (!pg || pg->scheme == POOL_SCHEME_REPLICATED ? 1 : pg->pg_size);
|
||||||
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
||||||
(uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
|
(uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
|
||||||
);
|
);
|
||||||
// Reconstruct parts if needed
|
// Reconstruct parts if needed
|
||||||
if (op_data->degraded)
|
if (op_data->degraded)
|
||||||
{
|
{
|
||||||
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
|
|
||||||
for (int cri = 0; cri < op_data->chain_read_count; cri++)
|
for (int cri = 0; cri < op_data->chain_read_count; cri++)
|
||||||
{
|
{
|
||||||
// Reconstruct missing stripes
|
// Reconstruct missing stripes
|
||||||
osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
|
osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
|
||||||
if (pg.scheme == POOL_SCHEME_XOR)
|
if (pg->scheme == POOL_SCHEME_XOR)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_xor(stripes, pg.pg_size, clean_entry_bitmap_size);
|
reconstruct_stripes_xor(stripes, pg->pg_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
else if (pg.scheme == POOL_SCHEME_EC)
|
else if (pg->scheme == POOL_SCHEME_EC)
|
||||||
{
|
{
|
||||||
reconstruct_stripes_ec(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
|
reconstruct_stripes_ec(stripes, pg->pg_size, pg->pg_data_size, clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Send bitmap
|
// Send bitmap
|
||||||
cur_op->reply.rw.bitmap_len = pg.pg_data_size * clean_entry_bitmap_size;
|
cur_op->reply.rw.bitmap_len = (pg ? pg->pg_data_size : 1) * clean_entry_bitmap_size;
|
||||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||||
// And finally compose the result
|
// And finally compose the result
|
||||||
uint64_t sent = 0;
|
uint64_t sent = 0;
|
||||||
|
|
|
@ -67,26 +67,20 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
if (cur_op->op_data)
|
if (cur_op->op_data)
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg->pg_data_size * bs_block_size;
|
{
|
||||||
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += (cur_op->op_data->pg
|
||||||
|
? cur_op->op_data->pg->pg_data_size : 1) * bs_block_size;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||||
}
|
}
|
||||||
if (cur_op->op_data)
|
if (cur_op->op_data)
|
||||||
{
|
{
|
||||||
if (cur_op->op_data->pg_num > 0)
|
if (cur_op->op_data->pg)
|
||||||
{
|
{
|
||||||
auto & pg = *cur_op->op_data->pg;
|
auto & pg = *cur_op->op_data->pg;
|
||||||
pg.inflight--;
|
rm_inflight(pg);
|
||||||
assert(pg.inflight >= 0);
|
|
||||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
|
||||||
finish_stop_pg(pg);
|
|
||||||
}
|
|
||||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
|
||||||
start_pg_peering(pg);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
assert(!cur_op->op_data->subops);
|
assert(!cur_op->op_data->subops);
|
||||||
free(cur_op->op_data);
|
free(cur_op->op_data);
|
||||||
|
@ -126,10 +120,10 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
||||||
bool wr = submit_type == SUBMIT_WRITE;
|
bool wr = submit_type == SUBMIT_WRITE;
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
osd_rmw_stripe_t *stripes = op_data->stripes;
|
osd_rmw_stripe_t *stripes = op_data->stripes;
|
||||||
bool rep = op_data->pg->scheme == POOL_SCHEME_REPLICATED;
|
bool rep = !op_data->pg || op_data->pg->scheme == POOL_SCHEME_REPLICATED;
|
||||||
// Allocate subops
|
// Allocate subops
|
||||||
int n_subops = 0, zero_read = -1;
|
int n_subops = 0, zero_read = -1;
|
||||||
for (int role = 0; role < op_data->pg->pg_size; role++)
|
for (int role = 0; role < (op_data->pg ? op_data->pg->pg_size : 1); role++)
|
||||||
{
|
{
|
||||||
if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
|
if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
|
||||||
zero_read = role;
|
zero_read = role;
|
||||||
|
@ -152,11 +146,11 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
||||||
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
|
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
|
||||||
osd_rmw_stripe_t *stripes, const uint64_t* osd_set, osd_op_t *cur_op, int subop_idx, int zero_read)
|
osd_rmw_stripe_t *stripes, const uint64_t* osd_set, osd_op_t *cur_op, int subop_idx, int zero_read)
|
||||||
{
|
{
|
||||||
bool rep = cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED;
|
bool rep = !cur_op->op_data->pg || cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED;
|
||||||
bool wr = submit_type == SUBMIT_WRITE;
|
bool wr = submit_type == SUBMIT_WRITE;
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
int i = subop_idx;
|
int i = subop_idx;
|
||||||
for (int role = 0; role < op_data->pg->pg_size; role++)
|
for (int role = 0; role < (op_data->pg ? op_data->pg->pg_size : 1); role++)
|
||||||
{
|
{
|
||||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||||
|
@ -233,7 +227,6 @@ void osd_t::submit_primary_subop(osd_op_t *cur_op, osd_op_t *subop,
|
||||||
subop->req.sec_rw = (osd_op_sec_rw_t){
|
subop->req.sec_rw = (osd_op_sec_rw_t){
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = (uint64_t)(wr ? (cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
|
.opcode = (uint64_t)(wr ? (cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
|
||||||
},
|
},
|
||||||
.oid = {
|
.oid = {
|
||||||
|
@ -435,6 +428,14 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||||
retval, expected, peer_osd
|
retval, expected, peer_osd
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
else if (opcode == OSD_OP_SEC_DELETE)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"delete subop to %jx:%jx v%ju failed on osd %jd: retval = %d (expected %d)\n",
|
||||||
|
subop->req.sec_del.oid.inode, subop->req.sec_del.oid.stripe, subop->req.sec_del.version,
|
||||||
|
peer_osd, retval, expected
|
||||||
|
);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
|
@ -452,15 +453,16 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
op_data->errcode = retval;
|
op_data->errcode = retval;
|
||||||
}
|
}
|
||||||
op_data->errors++;
|
|
||||||
if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
|
if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
|
||||||
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
|
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
|
||||||
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
||||||
{
|
{
|
||||||
// Drop connection on unexpected errors
|
// Drop connection on unexpected errors
|
||||||
op_data->drops++;
|
|
||||||
msgr.stop_client(subop->peer_fd);
|
msgr.stop_client(subop->peer_fd);
|
||||||
|
op_data->drops++;
|
||||||
}
|
}
|
||||||
|
// Increase op_data->errors after stop_client to prevent >= n_subops running twice
|
||||||
|
op_data->errors++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -593,7 +595,6 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_DELETE,
|
.opcode = OSD_OP_SEC_DELETE,
|
||||||
},
|
},
|
||||||
.oid = chunk.oid,
|
.oid = chunk.oid,
|
||||||
|
@ -653,7 +654,6 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||||
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_SYNC,
|
.opcode = OSD_OP_SEC_SYNC,
|
||||||
},
|
},
|
||||||
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
|
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
|
||||||
|
@ -712,7 +712,6 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_STABILIZE,
|
.opcode = OSD_OP_SEC_STABILIZE,
|
||||||
},
|
},
|
||||||
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
||||||
|
@ -806,7 +805,6 @@ void osd_t::submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd
|
||||||
subop->req = (osd_any_op_t){ .sec_stab = {
|
subop->req = (osd_any_op_t){ .sec_stab = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_ROLLBACK,
|
.opcode = OSD_OP_SEC_ROLLBACK,
|
||||||
},
|
},
|
||||||
.len = sizeof(obj_ver_id),
|
.len = sizeof(obj_ver_id),
|
||||||
|
|
|
@ -80,15 +80,17 @@ resume_2:
|
||||||
this->unstable_writes.clear();
|
this->unstable_writes.clear();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
|
op_data->dirty_pg_count = dirty_pgs.size();
|
||||||
|
op_data->dirty_osd_count = dirty_osds.size();
|
||||||
void *dirty_buf = malloc_or_die(
|
void *dirty_buf = malloc_or_die(
|
||||||
sizeof(pool_pg_num_t)*dirty_pgs.size() +
|
sizeof(pool_pg_num_t)*dirty_pgs.size() +
|
||||||
|
sizeof(uint64_t)*dirty_pgs.size() +
|
||||||
sizeof(osd_num_t)*dirty_osds.size() +
|
sizeof(osd_num_t)*dirty_osds.size() +
|
||||||
sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
|
sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
|
||||||
);
|
);
|
||||||
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
||||||
op_data->dirty_osds = (osd_num_t*)((uint8_t*)dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
|
uint64_t *pg_del_counts = (uint64_t*)((uint8_t*)op_data->dirty_pgs + (sizeof(pool_pg_num_t))*op_data->dirty_pg_count);
|
||||||
op_data->dirty_pg_count = dirty_pgs.size();
|
op_data->dirty_osds = (osd_num_t*)((uint8_t*)pg_del_counts + 8*op_data->dirty_pg_count);
|
||||||
op_data->dirty_osd_count = dirty_osds.size();
|
|
||||||
if (this->copies_to_delete_after_sync_count)
|
if (this->copies_to_delete_after_sync_count)
|
||||||
{
|
{
|
||||||
op_data->copies_to_delete_count = 0;
|
op_data->copies_to_delete_count = 0;
|
||||||
|
@ -103,16 +105,16 @@ resume_2:
|
||||||
sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
|
sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
|
||||||
);
|
);
|
||||||
op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
|
op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
|
||||||
this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
|
||||||
pg.copies_to_delete_after_sync.clear();
|
|
||||||
}
|
}
|
||||||
assert(this->copies_to_delete_after_sync_count == 0);
|
|
||||||
}
|
}
|
||||||
int dpg = 0;
|
int dpg = 0;
|
||||||
for (auto dirty_pg_num: dirty_pgs)
|
for (auto dirty_pg_num: dirty_pgs)
|
||||||
{
|
{
|
||||||
pgs.at(dirty_pg_num).inflight++;
|
auto & pg = pgs.at(dirty_pg_num);
|
||||||
op_data->dirty_pgs[dpg++] = dirty_pg_num;
|
pg.inflight++;
|
||||||
|
op_data->dirty_pgs[dpg] = dirty_pg_num;
|
||||||
|
pg_del_counts[dpg] = pg.copies_to_delete_after_sync.size();
|
||||||
|
dpg++;
|
||||||
}
|
}
|
||||||
dirty_pgs.clear();
|
dirty_pgs.clear();
|
||||||
dpg = 0;
|
dpg = 0;
|
||||||
|
@ -183,23 +185,6 @@ resume_6:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (op_data->copies_to_delete)
|
|
||||||
{
|
|
||||||
// Return 'copies to delete' back into respective PGs
|
|
||||||
for (int i = 0; i < op_data->copies_to_delete_count; i++)
|
|
||||||
{
|
|
||||||
auto & w = op_data->copies_to_delete[i];
|
|
||||||
auto & pg = pgs.at((pool_pg_num_t){
|
|
||||||
.pool_id = INODE_POOL(w.oid.inode),
|
|
||||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
|
||||||
});
|
|
||||||
if (pg.state & PG_ACTIVE)
|
|
||||||
{
|
|
||||||
pg.copies_to_delete_after_sync.push_back(w);
|
|
||||||
copies_to_delete_after_sync_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (op_data->copies_to_delete)
|
else if (op_data->copies_to_delete)
|
||||||
{
|
{
|
||||||
|
@ -213,6 +198,22 @@ resume_8:
|
||||||
{
|
{
|
||||||
goto resume_6;
|
goto resume_6;
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
uint64_t *pg_del_counts = (uint64_t*)((uint8_t*)op_data->dirty_pgs + (sizeof(pool_pg_num_t))*op_data->dirty_pg_count);
|
||||||
|
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||||
|
{
|
||||||
|
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||||
|
auto n = pg_del_counts[i];
|
||||||
|
assert(copies_to_delete_after_sync_count >= n);
|
||||||
|
copies_to_delete_after_sync_count -= n;
|
||||||
|
pg.copies_to_delete_after_sync.erase(pg.copies_to_delete_after_sync.begin(), pg.copies_to_delete_after_sync.begin()+n);
|
||||||
|
if (!pg.misplaced_objects.size() && !pg.copies_to_delete_after_sync.size() && (pg.state & PG_HAS_MISPLACED))
|
||||||
|
{
|
||||||
|
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||||
|
report_pg_state(pg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (immediate_commit == IMMEDIATE_NONE)
|
if (immediate_commit == IMMEDIATE_NONE)
|
||||||
{
|
{
|
||||||
// Mark OSDs as dirty because deletions have to be synced too!
|
// Mark OSDs as dirty because deletions have to be synced too!
|
||||||
|
@ -226,15 +227,7 @@ resume_8:
|
||||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||||
{
|
{
|
||||||
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||||
pg.inflight--;
|
rm_inflight(pg);
|
||||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
|
||||||
finish_stop_pg(pg);
|
|
||||||
}
|
|
||||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
|
||||||
{
|
|
||||||
start_pg_peering(pg);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// FIXME: Free those in the destructor (not here)?
|
// FIXME: Free those in the destructor (not here)?
|
||||||
free(op_data->dirty_pgs);
|
free(op_data->dirty_pgs);
|
||||||
|
|
|
@ -301,6 +301,38 @@ resume_12:
|
||||||
}
|
}
|
||||||
if (op_data->object_state)
|
if (op_data->object_state)
|
||||||
{
|
{
|
||||||
|
// Any kind of a non-clean object can have extra chunks, because we don't record objects
|
||||||
|
// as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
|
||||||
|
if (immediate_commit != IMMEDIATE_ALL)
|
||||||
|
{
|
||||||
|
// We can't remove extra chunks yet if fsyncs are explicit, because
|
||||||
|
// new copies may not be committed to stable storage yet
|
||||||
|
// We can only remove extra chunks after a successful SYNC for this PG
|
||||||
|
for (auto & chunk: op_data->object_state->osd_set)
|
||||||
|
{
|
||||||
|
// Check is the same as in submit_primary_del_subops()
|
||||||
|
if (pg.scheme == POOL_SCHEME_REPLICATED
|
||||||
|
? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
|
||||||
|
: (chunk.osd_num != pg.cur_set[chunk.role]))
|
||||||
|
{
|
||||||
|
pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
|
||||||
|
.osd_num = chunk.osd_num,
|
||||||
|
.oid = {
|
||||||
|
.inode = op_data->oid.inode,
|
||||||
|
.stripe = op_data->oid.stripe | (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
|
||||||
|
},
|
||||||
|
.version = op_data->fact_ver,
|
||||||
|
});
|
||||||
|
copies_to_delete_after_sync_count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (pg.copies_to_delete_after_sync.size() && !(pg.state & PG_HAS_MISPLACED))
|
||||||
|
{
|
||||||
|
// PG can't be active+clean until extra copies aren't removed, so mark it as PG_HAS_MISPLACED
|
||||||
|
pg.state |= PG_HAS_MISPLACED;
|
||||||
|
//this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
|
}
|
||||||
|
}
|
||||||
// We must forget the unclean state of the object before deleting it
|
// We must forget the unclean state of the object before deleting it
|
||||||
// so the next reads don't accidentally read a deleted version
|
// so the next reads don't accidentally read a deleted version
|
||||||
// And it should be done at the same time as the removal of the version override
|
// And it should be done at the same time as the removal of the version override
|
||||||
|
@ -309,6 +341,7 @@ resume_12:
|
||||||
}
|
}
|
||||||
resume_6:
|
resume_6:
|
||||||
resume_7:
|
resume_7:
|
||||||
|
op_data->n_subops = 0;
|
||||||
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
|
@ -344,36 +377,10 @@ resume_7:
|
||||||
);
|
);
|
||||||
recovery_stat[recovery_type].usec += usec;
|
recovery_stat[recovery_type].usec += usec;
|
||||||
}
|
}
|
||||||
// Any kind of a non-clean object can have extra chunks, because we don't record objects
|
if (immediate_commit == IMMEDIATE_ALL)
|
||||||
// as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
|
|
||||||
if (immediate_commit != IMMEDIATE_ALL)
|
|
||||||
{
|
|
||||||
// We can't remove extra chunks yet if fsyncs are explicit, because
|
|
||||||
// new copies may not be committed to stable storage yet
|
|
||||||
// We can only remove extra chunks after a successful SYNC for this PG
|
|
||||||
for (auto & chunk: op_data->object_state->osd_set)
|
|
||||||
{
|
|
||||||
// Check is the same as in submit_primary_del_subops()
|
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED
|
|
||||||
? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
|
|
||||||
: (chunk.osd_num != pg.cur_set[chunk.role]))
|
|
||||||
{
|
|
||||||
pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
|
|
||||||
.osd_num = chunk.osd_num,
|
|
||||||
.oid = {
|
|
||||||
.inode = op_data->oid.inode,
|
|
||||||
.stripe = op_data->oid.stripe | (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
|
|
||||||
},
|
|
||||||
.version = op_data->fact_ver,
|
|
||||||
});
|
|
||||||
copies_to_delete_after_sync_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
deref_object_state(pg, &op_data->object_state, true);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||||
|
}
|
||||||
deref_object_state(pg, &op_data->object_state, true);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
if (op_data->n_subops > 0)
|
if (op_data->n_subops > 0)
|
||||||
{
|
{
|
||||||
|
@ -388,7 +395,6 @@ resume_9:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||||
cur_op->reply.rw.version = op_data->fact_ver;
|
cur_op->reply.rw.version = op_data->fact_ver;
|
||||||
continue_others:
|
continue_others:
|
||||||
|
|
|
@ -162,6 +162,7 @@ struct reed_sol_matrix_t
|
||||||
int refs = 0;
|
int refs = 0;
|
||||||
int *je_data;
|
int *je_data;
|
||||||
uint8_t *isal_data;
|
uint8_t *isal_data;
|
||||||
|
int isal_item_size;
|
||||||
// 32 bytes = 256/8 = max pg_size/8
|
// 32 bytes = 256/8 = max pg_size/8
|
||||||
std::map<std::array<uint8_t, 32>, void*> subdata;
|
std::map<std::array<uint8_t, 32>, void*> subdata;
|
||||||
std::map<reed_sol_erased_t, void*> decodings;
|
std::map<reed_sol_erased_t, void*> decodings;
|
||||||
|
@ -181,20 +182,42 @@ void use_ec(int pg_size, int pg_minsize, bool use)
|
||||||
}
|
}
|
||||||
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
|
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
|
||||||
uint8_t *isal_table = NULL;
|
uint8_t *isal_table = NULL;
|
||||||
|
int item_size = 8;
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
uint8_t *isal_matrix = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize));
|
uint8_t *isal_matrix = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize));
|
||||||
for (int i = 0; i < pg_minsize*(pg_size-pg_minsize); i++)
|
for (int i = 0; i < pg_minsize*(pg_size-pg_minsize); i++)
|
||||||
{
|
{
|
||||||
isal_matrix[i] = matrix[i];
|
isal_matrix[i] = matrix[i];
|
||||||
}
|
}
|
||||||
isal_table = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize)*32);
|
isal_table = (uint8_t*)calloc_or_die(1, pg_minsize*(pg_size-pg_minsize)*32);
|
||||||
ec_init_tables(pg_minsize, pg_size-pg_minsize, isal_matrix, isal_table);
|
ec_init_tables(pg_minsize, pg_size-pg_minsize, isal_matrix, isal_table);
|
||||||
free(isal_matrix);
|
free(isal_matrix);
|
||||||
|
for (int i = pg_minsize*(pg_size-pg_minsize)*8; i < pg_minsize*(pg_size-pg_minsize)*32; i++)
|
||||||
|
{
|
||||||
|
if (isal_table[i] != 0)
|
||||||
|
{
|
||||||
|
// ISA-L GF-NI version uses 8-byte table items
|
||||||
|
item_size = 32;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Sanity check: rows should never consist of all zeroes
|
||||||
|
uint8_t zero_row[pg_minsize*item_size];
|
||||||
|
memset(zero_row, 0, pg_minsize*item_size);
|
||||||
|
for (int i = 0; i < (pg_size-pg_minsize); i++)
|
||||||
|
{
|
||||||
|
if (memcmp(isal_table + i*pg_minsize*item_size, zero_row, pg_minsize*item_size) == 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "BUG or ISA-L incompatibility: EC tables shouldn't have all-zero rows\n");
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
matrices[key] = (reed_sol_matrix_t){
|
matrices[key] = (reed_sol_matrix_t){
|
||||||
.refs = 0,
|
.refs = 0,
|
||||||
.je_data = matrix,
|
.je_data = matrix,
|
||||||
.isal_data = isal_table,
|
.isal_data = isal_table,
|
||||||
|
.isal_item_size = item_size,
|
||||||
};
|
};
|
||||||
rs_it = matrices.find(key);
|
rs_it = matrices.find(key);
|
||||||
}
|
}
|
||||||
|
@ -235,7 +258,7 @@ static reed_sol_matrix_t* get_ec_matrix(int pg_size, int pg_minsize)
|
||||||
// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
|
// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
|
||||||
// the decoding matrix.
|
// the decoding matrix.
|
||||||
// all these flaws are fixed in this function:
|
// all these flaws are fixed in this function:
|
||||||
static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
|
static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, int *item_size)
|
||||||
{
|
{
|
||||||
int edd = 0;
|
int edd = 0;
|
||||||
int erased[pg_size];
|
int erased[pg_size];
|
||||||
|
@ -292,6 +315,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
|
||||||
int *erased_copy = (int*)(rectable + 32*smrow*pg_minsize);
|
int *erased_copy = (int*)(rectable + 32*smrow*pg_minsize);
|
||||||
memcpy(erased_copy, erased, pg_size*sizeof(int));
|
memcpy(erased_copy, erased, pg_size*sizeof(int));
|
||||||
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, rectable);
|
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, rectable);
|
||||||
|
*item_size = matrix->isal_item_size;
|
||||||
return rectable;
|
return rectable;
|
||||||
#else
|
#else
|
||||||
int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
|
int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
|
||||||
|
@ -355,7 +379,8 @@ static void jerasure_matrix_encode_unaligned(int k, int m, int w, int *matrix, c
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
||||||
{
|
{
|
||||||
uint8_t *dectable = (uint8_t*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
|
int item_size = 0;
|
||||||
|
uint8_t *dectable = (uint8_t*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize, &item_size);
|
||||||
if (!dectable)
|
if (!dectable)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
|
@ -378,7 +403,7 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ec_encode_data(
|
ec_encode_data(
|
||||||
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*32*pg_minsize,
|
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*item_size*pg_minsize,
|
||||||
data_ptrs, data_ptrs + pg_minsize
|
data_ptrs, data_ptrs + pg_minsize
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -433,7 +458,7 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
||||||
#else
|
#else
|
||||||
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
||||||
{
|
{
|
||||||
int *dm_ids = (int*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
|
int *dm_ids = (int*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize, NULL);
|
||||||
if (!dm_ids)
|
if (!dm_ids)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
|
@ -980,7 +1005,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||||
{
|
{
|
||||||
int item_size =
|
int item_size =
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
32;
|
matrix->isal_item_size;
|
||||||
#else
|
#else
|
||||||
sizeof(int);
|
sizeof(int);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -65,7 +65,6 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi
|
||||||
.sec_list = {
|
.sec_list = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_LIST,
|
.opcode = OSD_OP_SEC_LIST,
|
||||||
},
|
},
|
||||||
.list_pg = pg_num,
|
.list_pg = pg_num,
|
||||||
|
|
|
@ -79,6 +79,32 @@ void osd_t::exec_secondary(osd_op_t *op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool osd_t::sec_check_pg_lock(osd_num_t primary_osd, const object_id &oid)
|
||||||
|
{
|
||||||
|
if (!enable_pg_locks)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
pool_id_t pool_id = INODE_POOL(oid.inode);
|
||||||
|
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
||||||
|
if (pool_cfg_it == st_cli.pool_config.end())
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
auto ppg = (pool_pg_num_t){ .pool_id = pool_id, .pg_num = map_to_pg(oid, pool_cfg_it->second.pg_stripe_size) };
|
||||||
|
auto pg_it = pgs.find(ppg);
|
||||||
|
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (pg_it->second.disable_pg_locks)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
auto lock_it = pg_locks.find(ppg);
|
||||||
|
return lock_it != pg_locks.end() && lock_it->second.primary_osd == primary_osd;
|
||||||
|
}
|
||||||
|
|
||||||
void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST &&
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST &&
|
||||||
|
@ -89,23 +115,15 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||||
{
|
{
|
||||||
int n = cur_op->req.sec_read_bmp.len / sizeof(obj_ver_id);
|
exec_sec_read_bmp(cur_op);
|
||||||
if (n > 0)
|
|
||||||
{
|
|
||||||
obj_ver_id *ov = (obj_ver_id*)cur_op->buf;
|
|
||||||
void *reply_buf = malloc_or_die(n * (8 + clean_entry_bitmap_size));
|
|
||||||
void *cur_buf = reply_buf;
|
|
||||||
for (int i = 0; i < n; i++)
|
|
||||||
{
|
|
||||||
bs->read_bitmap(ov[i].oid, ov[i].version, (uint8_t*)cur_buf + sizeof(uint64_t), (uint64_t*)cur_buf);
|
|
||||||
cur_buf = (uint8_t*)cur_buf + (8 + clean_entry_bitmap_size);
|
|
||||||
}
|
|
||||||
free(cur_op->buf);
|
|
||||||
cur_op->buf = reply_buf;
|
|
||||||
}
|
|
||||||
finish_op(cur_op, n * (8 + clean_entry_bitmap_size));
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_LOCK)
|
||||||
|
{
|
||||||
|
exec_sec_lock(cur_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto cl = msgr.clients.at(cur_op->peer_fd);
|
||||||
cur_op->bs_op = new blockstore_op_t();
|
cur_op->bs_op = new blockstore_op_t();
|
||||||
cur_op->bs_op->callback = [this, cur_op](blockstore_op_t* bs_op) { secondary_op_callback(cur_op); };
|
cur_op->bs_op->callback = [this, cur_op](blockstore_op_t* bs_op) { secondary_op_callback(cur_op); };
|
||||||
cur_op->bs_op->opcode = (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ? BS_OP_READ
|
cur_op->bs_op->opcode = (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ? BS_OP_READ
|
||||||
|
@ -121,6 +139,13 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
|
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||||
{
|
{
|
||||||
|
if (!(cur_op->req.sec_rw.flags & OSD_OP_IGNORE_PG_LOCK) &&
|
||||||
|
!sec_check_pg_lock(cl->osd_num, cur_op->req.sec_rw.oid))
|
||||||
|
{
|
||||||
|
cur_op->bs_op->retval = -EPIPE;
|
||||||
|
secondary_op_callback(cur_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||||
{
|
{
|
||||||
// Allocate memory for the read operation
|
// Allocate memory for the read operation
|
||||||
|
@ -143,6 +168,13 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
|
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
|
||||||
{
|
{
|
||||||
|
if (!(cur_op->req.sec_del.flags & OSD_OP_IGNORE_PG_LOCK) &&
|
||||||
|
!sec_check_pg_lock(cl->osd_num, cur_op->req.sec_del.oid))
|
||||||
|
{
|
||||||
|
cur_op->bs_op->retval = -EPIPE;
|
||||||
|
secondary_op_callback(cur_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
cur_op->bs_op->oid = cur_op->req.sec_del.oid;
|
cur_op->bs_op->oid = cur_op->req.sec_del.oid;
|
||||||
cur_op->bs_op->version = cur_op->req.sec_del.version;
|
cur_op->bs_op->version = cur_op->req.sec_del.version;
|
||||||
#ifdef OSD_STUB
|
#ifdef OSD_STUB
|
||||||
|
@ -157,6 +189,18 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
#ifdef OSD_STUB
|
#ifdef OSD_STUB
|
||||||
cur_op->bs_op->retval = 0;
|
cur_op->bs_op->retval = 0;
|
||||||
#endif
|
#endif
|
||||||
|
if (enable_pg_locks && !(cur_op->req.sec_stab.flags & OSD_OP_IGNORE_PG_LOCK))
|
||||||
|
{
|
||||||
|
for (int i = 0; i < cur_op->bs_op->len; i++)
|
||||||
|
{
|
||||||
|
if (!sec_check_pg_lock(cl->osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
|
||||||
|
{
|
||||||
|
cur_op->bs_op->retval = -EPIPE;
|
||||||
|
secondary_op_callback(cur_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||||
{
|
{
|
||||||
|
@ -192,12 +236,99 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void osd_t::exec_sec_read_bmp(osd_op_t *cur_op)
|
||||||
|
{
|
||||||
|
auto cl = msgr.clients.at(cur_op->peer_fd);
|
||||||
|
int n = cur_op->req.sec_read_bmp.len / sizeof(obj_ver_id);
|
||||||
|
if (n > 0)
|
||||||
|
{
|
||||||
|
obj_ver_id *ov = (obj_ver_id*)cur_op->buf;
|
||||||
|
void *reply_buf = malloc_or_die(n * (8 + clean_entry_bitmap_size));
|
||||||
|
void *cur_buf = reply_buf;
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
if (!sec_check_pg_lock(cl->osd_num, ov[i].oid) &&
|
||||||
|
!(cur_op->req.sec_read_bmp.flags & OSD_OP_IGNORE_PG_LOCK))
|
||||||
|
{
|
||||||
|
free(reply_buf);
|
||||||
|
cur_op->bs_op->retval = -EPIPE;
|
||||||
|
secondary_op_callback(cur_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
bs->read_bitmap(ov[i].oid, ov[i].version, (uint8_t*)cur_buf + sizeof(uint64_t), (uint64_t*)cur_buf);
|
||||||
|
cur_buf = (uint8_t*)cur_buf + (8 + clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
free(cur_op->buf);
|
||||||
|
cur_op->buf = reply_buf;
|
||||||
|
}
|
||||||
|
finish_op(cur_op, n * (8 + clean_entry_bitmap_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lock/Unlock PG
|
||||||
|
void osd_t::exec_sec_lock(osd_op_t *cur_op)
|
||||||
|
{
|
||||||
|
cur_op->reply.sec_lock.cur_primary = 0;
|
||||||
|
auto cl = msgr.clients.at(cur_op->peer_fd);
|
||||||
|
if (!cl->osd_num ||
|
||||||
|
cur_op->req.sec_lock.flags != OSD_SEC_LOCK_PG &&
|
||||||
|
cur_op->req.sec_lock.flags != OSD_SEC_UNLOCK_PG ||
|
||||||
|
cur_op->req.sec_lock.pool_id > ((uint64_t)1<<POOL_ID_BITS) ||
|
||||||
|
!cur_op->req.sec_lock.pg_num ||
|
||||||
|
cur_op->req.sec_lock.pg_num > UINT32_MAX)
|
||||||
|
{
|
||||||
|
finish_op(cur_op, -EINVAL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto ppg = (pool_pg_num_t){ .pool_id = (pool_id_t)cur_op->req.sec_lock.pool_id, .pg_num = (pg_num_t)cur_op->req.sec_lock.pg_num };
|
||||||
|
auto pool_cfg_it = st_cli.pool_config.find(ppg.pool_id);
|
||||||
|
if (pool_cfg_it == st_cli.pool_config.end() ||
|
||||||
|
pool_cfg_it->second.real_pg_count < cur_op->req.sec_lock.pg_num)
|
||||||
|
{
|
||||||
|
finish_op(cur_op, -ENOENT);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto lock_it = pg_locks.find(ppg);
|
||||||
|
if (cur_op->req.sec_lock.flags == OSD_SEC_LOCK_PG)
|
||||||
|
{
|
||||||
|
if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->osd_num)
|
||||||
|
{
|
||||||
|
cur_op->reply.sec_lock.cur_primary = lock_it->second.primary_osd;
|
||||||
|
finish_op(cur_op, -EBUSY);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto primary_pg_it = pgs.find(ppg);
|
||||||
|
if (primary_pg_it != pgs.end() && primary_pg_it->second.state != PG_OFFLINE)
|
||||||
|
{
|
||||||
|
cur_op->reply.sec_lock.cur_primary = this->osd_num;
|
||||||
|
finish_op(cur_op, -EBUSY);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
pg_locks[ppg] = (osd_pg_lock_t){
|
||||||
|
.primary_osd = cl->osd_num,
|
||||||
|
.state = cur_op->req.sec_lock.pg_state,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->osd_num)
|
||||||
|
{
|
||||||
|
pg_locks.erase(lock_it);
|
||||||
|
}
|
||||||
|
finish_op(cur_op, 0);
|
||||||
|
}
|
||||||
|
|
||||||
void osd_t::exec_show_config(osd_op_t *cur_op)
|
void osd_t::exec_show_config(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
std::string json_err;
|
std::string json_err;
|
||||||
json11::Json req_json = cur_op->req.show_conf.json_len > 0
|
json11::Json req_json = cur_op->req.show_conf.json_len > 0
|
||||||
? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
|
? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
|
||||||
: json11::Json();
|
: json11::Json();
|
||||||
|
auto peer_osd_num = req_json["osd_num"].uint64_value();
|
||||||
|
auto cl = msgr.clients.at(cur_op->peer_fd);
|
||||||
|
cl->osd_num = peer_osd_num;
|
||||||
|
if (req_json["features"]["check_sequencing"].bool_value())
|
||||||
|
{
|
||||||
|
cl->check_sequencing = true;
|
||||||
|
cl->read_op_id = cur_op->req.hdr.id + 1;
|
||||||
|
}
|
||||||
// Expose sensitive configuration values so peers can check them
|
// Expose sensitive configuration values so peers can check them
|
||||||
json11::Json::object wire_config = json11::Json::object {
|
json11::Json::object wire_config = json11::Json::object {
|
||||||
{ "osd_num", osd_num },
|
{ "osd_num", osd_num },
|
||||||
|
@ -210,6 +341,7 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
|
||||||
{ "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
|
{ "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
|
||||||
(immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
|
(immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
|
||||||
{ "lease_timeout", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 },
|
{ "lease_timeout", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 },
|
||||||
|
{ "features", json11::Json::object{ { "pg_locks", true } } },
|
||||||
};
|
};
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
if (msgr.is_rdma_enabled())
|
if (msgr.is_rdma_enabled())
|
||||||
|
@ -222,7 +354,7 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
|
||||||
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
|
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
|
||||||
if (ok)
|
if (ok)
|
||||||
{
|
{
|
||||||
auto rc = msgr.clients.at(cur_op->peer_fd)->rdma_conn;
|
auto rc = cl->rdma_conn;
|
||||||
wire_config["rdma_address"] = rc->addr.to_string();
|
wire_config["rdma_address"] = rc->addr.to_string();
|
||||||
wire_config["rdma_max_msg"] = rc->max_msg;
|
wire_config["rdma_max_msg"] = rc->max_msg;
|
||||||
}
|
}
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue